In [2]:
import os
import pandas as pd
import numpy as np
from tqdm  import tqdm
import json
import pickle
import re
import glob

In [3]:
# Set the directory path where the pickle files are located
current_dir = os.getcwd()  # gets current working directory
project_dir = os.path.dirname(current_dir)  
directory_path = os.path.dirname(current_dir) + os.sep + 'data' + os.sep

# Data preparation

In [4]:
# Get a list of all the pickle files in the directory
pickle_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.pkl') and f.startswith('full') == False ]

# Load and concatenate the data from each file
# create the empty dataframe with 13 columns

concatenated_data = []
for file in tqdm(pickle_files):
    with open(file, 'rb') as f:
        staged_data = pickle.load(f)
        concatenated_data.append(staged_data)

data = pd.concat(concatenated_data, ignore_index = True, axis=0)
# adding id

data.head(3)

100%|██████████| 10/10 [00:00<00:00, 151.21it/s]


Unnamed: 0,category,title,series,episode_name,description,description2,tags,image,more,tags2,publication_date,rating,duration_sec
0,Panel Discussion,Spicks And Specks,Spicks And Specks,"Series 9 Missy Higgins, Dave O'Neil, Yeo & Jud...","Join Adam Hills, Myf Warhurst and Alan Brough ...",Missy Higgins and Dave O'Neil help Alan battle...,"[ABC TV, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE200...,"Hosts Adam Hills, Myf Warhurst, Alan Brough","[abc1, australia, panel, music, culture, enter...",2021-06-20 20:27:00,PG,2701
1,Panel Discussion,Would I Lie To You?,Would I Lie To You?,Series 13 Episode 4,Rob Brydon is back in the host's chair for ano...,Host Rob Brydon and team captains Lee Mack and...,"[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW213...,Host Rob Brydon,"[abc2, uk, comedy, panel, entertainment, cult-...",2022-02-08 20:30:35,PG,1737
2,Panel Discussion,Whovians,Whovians,Series 3 Episode 9,"Join Rove McManus, Tegan Higginbotham and Bajo...","Rove McManus, Tegan Higginbotham and Bajo are ...","[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE191...,"Hosts Rove McManus, Tegan Higginbotham, Steven...","[abc2, aussie, comedy, panel, entertainment, q...",2020-03-05 21:38:00,PG,2090


In [5]:
duplicate_filter = list(data.columns)

#duplicate_filter.remove('id')
duplicate_filter.remove('image')
duplicate_filter.remove('tags')
duplicate_filter.remove('tags2')
duplicate_filter.remove('category')

data.drop_duplicates(subset = duplicate_filter, inplace = True)

data.head(3)

Unnamed: 0,category,title,series,episode_name,description,description2,tags,image,more,tags2,publication_date,rating,duration_sec
0,Panel Discussion,Spicks And Specks,Spicks And Specks,"Series 9 Missy Higgins, Dave O'Neil, Yeo & Jud...","Join Adam Hills, Myf Warhurst and Alan Brough ...",Missy Higgins and Dave O'Neil help Alan battle...,"[ABC TV, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE200...,"Hosts Adam Hills, Myf Warhurst, Alan Brough","[abc1, australia, panel, music, culture, enter...",2021-06-20 20:27:00,PG,2701
1,Panel Discussion,Would I Lie To You?,Would I Lie To You?,Series 13 Episode 4,Rob Brydon is back in the host's chair for ano...,Host Rob Brydon and team captains Lee Mack and...,"[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW213...,Host Rob Brydon,"[abc2, uk, comedy, panel, entertainment, cult-...",2022-02-08 20:30:35,PG,1737
2,Panel Discussion,Whovians,Whovians,Series 3 Episode 9,"Join Rove McManus, Tegan Higginbotham and Bajo...","Rove McManus, Tegan Higginbotham and Bajo are ...","[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE191...,"Hosts Rove McManus, Tegan Higginbotham, Steven...","[abc2, aussie, comedy, panel, entertainment, q...",2020-03-05 21:38:00,PG,2090


In [6]:
data['id'] = data.index
cols = data.columns.tolist()
cols = cols[-1:] + cols[:-1]
data = data[cols]

In [7]:
# clarifying naming, defining missings

data.rename({'more':'directors_actors'}, axis = 1, inplace = True)
data.replace(['No data found', 'No more information found', 
              'No title found', 'No tags found', 'No image found', 'No description found'], np.nan, inplace = True)

data['directors_actors'] = data['directors_actors'].apply(str).apply(lambda x : x.split(', ') if x != 'nan' else [])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10789 entries, 0 to 13884
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10789 non-null  int64  
 1   category          10789 non-null  object 
 2   title             10788 non-null  object 
 3   series            10396 non-null  object 
 4   episode_name      10782 non-null  object 
 5   description       10788 non-null  object 
 6   description2      10785 non-null  object 
 7   tags              10788 non-null  object 
 8   image             10788 non-null  object 
 9   directors_actors  10789 non-null  object 
 10  tags2             10785 non-null  object 
 11  publication_date  10785 non-null  object 
 12  rating            9499 non-null   object 
 13  duration_sec      10784 non-null  float64
dtypes: float64(1), int64(1), object(12)
memory usage: 1.2+ MB


In [8]:
# clarifying naming, defining missings

data.rename({'more':'directors_actors'}, axis = 1, inplace = True)
data.replace(['No data found', 'No more information found', 
              'No title found', 'No tags found', 'No image found', 'No description found'], np.nan, inplace = True)

data['directors_actors'] = data['directors_actors'].apply(str).apply(lambda x : x.split(', ') if x != 'nan' else [])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10789 entries, 0 to 13884
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10789 non-null  int64  
 1   category          10789 non-null  object 
 2   title             10788 non-null  object 
 3   series            10396 non-null  object 
 4   episode_name      10782 non-null  object 
 5   description       10788 non-null  object 
 6   description2      10785 non-null  object 
 7   tags              10788 non-null  object 
 8   image             10788 non-null  object 
 9   directors_actors  10789 non-null  object 
 10  tags2             10785 non-null  object 
 11  publication_date  10785 non-null  object 
 12  rating            9499 non-null   object 
 13  duration_sec      10784 non-null  float64
dtypes: float64(1), int64(1), object(12)
memory usage: 1.2+ MB


In [9]:
cols = list(data.columns)
# dropping nans where it is necessary
unwanted = ['directors_actors', 'rating', 'series', 'directors_actors']
cols = [x for x in cols if x not in unwanted]

clean = data.dropna(subset = cols)
clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10781 entries, 0 to 13884
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10781 non-null  int64  
 1   category          10781 non-null  object 
 2   title             10781 non-null  object 
 3   series            10392 non-null  object 
 4   episode_name      10781 non-null  object 
 5   description       10781 non-null  object 
 6   description2      10781 non-null  object 
 7   tags              10781 non-null  object 
 8   image             10781 non-null  object 
 9   directors_actors  10781 non-null  object 
 10  tags2             10781 non-null  object 
 11  publication_date  10781 non-null  object 
 12  rating            9497 non-null   object 
 13  duration_sec      10781 non-null  float64
dtypes: float64(1), int64(1), object(12)
memory usage: 1.2+ MB


# Creating special columns for episodes

In [10]:
seasons = []
eps = []
ep_descs = []

for index, row in clean.iterrows():
    i = index
    txt = row.episode_name
    try:
        season = re.findall('Series [0-9]{1,2}', txt)[0]
        season = re.findall('[0-9]+', season)[0]
        seasons.append(season)
    except:
        seasons.append(np.nan)
    try: 
        ep = re.findall('Episode [0-9]{1,2}', txt)[0]
        ep = re.findall('[0-9]+', ep)[0]
        eps.append(ep)
    except:
        eps.append('')
    desc = re.sub('[0-9]+', '', txt)
    desc = desc.replace('Series', '')
    desc = desc.replace('Episode', '')
    ep_descs.append(desc)

clean = clean.assign(season=seasons, episode=eps, episode_title=ep_descs)
clean.head(3)

Unnamed: 0,id,category,title,series,episode_name,description,description2,tags,image,directors_actors,tags2,publication_date,rating,duration_sec,season,episode,episode_title
0,0,Panel Discussion,Spicks And Specks,Spicks And Specks,"Series 9 Missy Higgins, Dave O'Neil, Yeo & Jud...","Join Adam Hills, Myf Warhurst and Alan Brough ...",Missy Higgins and Dave O'Neil help Alan battle...,"[ABC TV, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE200...,"[['Hosts Adam Hills', 'Myf Warhurst', 'Alan Br...","[abc1, australia, panel, music, culture, enter...",2021-06-20 20:27:00,PG,2701.0,9,,"Missy Higgins, Dave O'Neil, Yeo & Judith Lucy"
1,1,Panel Discussion,Would I Lie To You?,Would I Lie To You?,Series 13 Episode 4,Rob Brydon is back in the host's chair for ano...,Host Rob Brydon and team captains Lee Mack and...,"[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW213...,[['Host Rob Brydon']],"[abc2, uk, comedy, panel, entertainment, cult-...",2022-02-08 20:30:35,PG,1737.0,13,4.0,
2,2,Panel Discussion,Whovians,Whovians,Series 3 Episode 9,"Join Rove McManus, Tegan Higginbotham and Bajo...","Rove McManus, Tegan Higginbotham and Bajo are ...","[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE191...,"[['Hosts Rove McManus', 'Tegan Higginbotham', ...","[abc2, aussie, comedy, panel, entertainment, q...",2020-03-05 21:38:00,PG,2090.0,3,9.0,


In [11]:
# Write the concatenated data to a new file
if os.path.exists(directory_path + '/full_data.pkl'):
    os.remove(directory_path + '/full_data.pkl')

with open(directory_path + '/full_data.pkl', 'wb') as f:
    pickle.dump(clean, f)

In [12]:
films = clean.copy()

films = films[(films['series'].isna()) & (films['category'] != 'News') & (films['category'] != 'Panel Discussion')]

films = films.reset_index(drop = True)

films['id'] = films.index
cols = films.columns.tolist()
cols = cols[-1:] + cols[:-1]
films = films[cols]

films.head(3)

Unnamed: 0,episode_title,id,category,title,series,episode_name,description,description2,tags,image,directors_actors,tags2,publication_date,rating,duration_sec,season,episode
0,The Way We Were,0,Movies,The Way We Were,,The Way We Were,A love story that begins with the attraction o...,A love story that begins with the attraction o...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zy/ZY933...,[['Director Sydney Pollack']],"[abc1, abc2, drama, classic, film, feature-len...",2021-06-25 07:00:00,PG,6802.0,,
1,Starman,1,Movies,Starman,,Starman,An alien stranded on Earth clones himself into...,An alien stranded on Earth clones himself into...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW301...,[['Director John Carpenter']],"[abc1, abc2, drama, sci-fi, fantasy, romance, ...",2021-06-25 07:00:00,PG,6612.0,,
2,Mr Deeds Goes To Town,2,Movies,Mr Deeds Goes To Town,,Mr Deeds Goes To Town,A small-town poet inherits a vast fortune and ...,A small-town poet inherits a vast fortune and ...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW302...,"[['Cast Gary Cooper', 'Jean Arthur', 'Douglas ...","[abc1, abc2, drama, comedy-drama, film, featur...",2021-06-25 07:00:00,G,6649.0,,


In [13]:
# Separating the films from the series
if os.path.exists(directory_path + '/full_movies.pkl'):
    os.remove(directory_path + '/full_movies.pkl')

with open(directory_path + '/full_movies.pkl', 'wb') as f:
    pickle.dump(films, f)

# Creating a new dataframe with only the series

In [14]:
series_raw = clean[(clean['series'].isna()==False)]

# assuming your original dataframe is called 'tv_series'
series_grouped = series_raw.groupby('title')

# create a new dataframe with the relevant information per TV series
series = pd.DataFrame({
    'category': series_grouped['category'].first(),
    'description': series_grouped['description'].first(),
    'description2': series_grouped['description2'].agg(list),
    'directors_actors': series_grouped['directors_actors'].sum().apply(set).apply(list),
    'publication_date': series_grouped['publication_date'].first(),
    'rating': series_grouped['rating'].first(),
    'duration_sec': series_grouped['duration_sec'].agg(list),
    'tags': series_grouped['tags'].first(),
    'tags2' : series_grouped['tags2'].sum().apply(set).apply(list),
    'image': series_grouped['image'].first()
})


series.reset_index(inplace = True)
series['id'] = series.index + 10_000
cols = series.columns.tolist()
cols = cols[-1:] + cols[:-1]
series = series[cols]

series.head(3)

Unnamed: 0,id,title,category,description,description2,directors_actors,publication_date,rating,duration_sec,tags,tags2,image
0,10000,199 Little Heroes,Education,The journey to school is a very special type o...,"[Enjo lives in Quinten, a forest glade in the ...",[[]],2021-06-01 06:20:00,G,"[311.0, 311.0, 312.0, 311.0, 311.0, 320.0, 311...","[ABC ME, EDUCATION]","[environment-day, refugee-day, primary-humanit...",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW073...
1,10001,7.30,News,Leigh Sales presents Australia's leading night...,[The devastation caused by the recent floods i...,[[]],2022-03-14 20:00:00,,"[1906.0, 1894.0, 1879.0, 1935.0, 1891.0, 1822....","[ABC TV, ABC NEWS]","[business, 730, news, analysis, news24, invest...",https://cdn.iview.abc.net.au/thumbs/i/nc/NC220...
2,10002,7.30 Mark Humphries Satire,News,Satirist Mark Humphries brings his unique pers...,[Satirist Mark Humphries goes inside the Healt...,[[]],2021-05-11 14:00:00,,"[145.0, 110.0, 118.0, 136.0, 202.0, 115.0, 241...",[ABC NEWS],"[candidates, humphries, satire, news24, ley, p...",https://cdn.iview.abc.net.au/thumbs/i/nn/NN211...


In [15]:
# Write the concatenated data to a new file
if os.path.exists(directory_path + '/full_shows.pkl'):
    os.remove(directory_path + '/full_shows.pkl')

with open(directory_path + '/full_shows.pkl', 'wb') as f:
    pickle.dump(series, f)

In [16]:
pd.read_pickle(directory_path + '/full_shows.pkl').head(3)

Unnamed: 0,id,title,category,description,description2,directors_actors,publication_date,rating,duration_sec,tags,tags2,image
0,10000,199 Little Heroes,Education,The journey to school is a very special type o...,"[Enjo lives in Quinten, a forest glade in the ...",[[]],2021-06-01 06:20:00,G,"[311.0, 311.0, 312.0, 311.0, 311.0, 320.0, 311...","[ABC ME, EDUCATION]","[environment-day, refugee-day, primary-humanit...",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW073...
1,10001,7.30,News,Leigh Sales presents Australia's leading night...,[The devastation caused by the recent floods i...,[[]],2022-03-14 20:00:00,,"[1906.0, 1894.0, 1879.0, 1935.0, 1891.0, 1822....","[ABC TV, ABC NEWS]","[business, 730, news, analysis, news24, invest...",https://cdn.iview.abc.net.au/thumbs/i/nc/NC220...
2,10002,7.30 Mark Humphries Satire,News,Satirist Mark Humphries brings his unique pers...,[Satirist Mark Humphries goes inside the Healt...,[[]],2021-05-11 14:00:00,,"[145.0, 110.0, 118.0, 136.0, 202.0, 115.0, 241...",[ABC NEWS],"[candidates, humphries, satire, news24, ley, p...",https://cdn.iview.abc.net.au/thumbs/i/nn/NN211...


In [17]:
# Just to check
pd.read_pickle(directory_path + '/full_movies.pkl').head(3)

Unnamed: 0,episode_title,id,category,title,series,episode_name,description,description2,tags,image,directors_actors,tags2,publication_date,rating,duration_sec,season,episode
0,The Way We Were,0,Movies,The Way We Were,,The Way We Were,A love story that begins with the attraction o...,A love story that begins with the attraction o...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zy/ZY933...,[['Director Sydney Pollack']],"[abc1, abc2, drama, classic, film, feature-len...",2021-06-25 07:00:00,PG,6802.0,,
1,Starman,1,Movies,Starman,,Starman,An alien stranded on Earth clones himself into...,An alien stranded on Earth clones himself into...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW301...,[['Director John Carpenter']],"[abc1, abc2, drama, sci-fi, fantasy, romance, ...",2021-06-25 07:00:00,PG,6612.0,,
2,Mr Deeds Goes To Town,2,Movies,Mr Deeds Goes To Town,,Mr Deeds Goes To Town,A small-town poet inherits a vast fortune and ...,A small-town poet inherits a vast fortune and ...,"[ABC TV, ABC TV Plus, DRAMA, MOVIES]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW302...,"[['Cast Gary Cooper', 'Jean Arthur', 'Douglas ...","[abc1, abc2, drama, comedy-drama, film, featur...",2021-06-25 07:00:00,G,6649.0,,
