In [116]:
import os
import pandas as pd
import numpy as np
from tqdm  import tqdm
import pickle
import re
import glob

# Data preparation

In [117]:
# Set the directory path where the pickle files are located
current_dir = os.getcwd()  # gets current working directory
project_dir = os.path.dirname(current_dir)  
directory_path = os.path.dirname(current_dir) + os.sep + 'data' + os.sep

# Get a list of all the pickle files in the directory
pickle_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.pkl') and f.startswith('full') == False ]

# Load and concatenate the data from each file
# create the empty dataframe with 13 columns

concatenated_data = []
for file in tqdm(pickle_files):
    with open(file, 'rb') as f:
        staged_data = pickle.load(f)
        concatenated_data.append(staged_data)

data = pd.concat(concatenated_data, axis=0)
# adding id
data['id'] = data.index
cols = data.columns.tolist()
cols = cols[-1:] + cols[:-1]
data = data[cols]

100%|██████████| 10/10 [00:00<00:00, 240.88it/s]


In [118]:
# arts is incorrectly labelled as comedy, also good naming, and missing values

data['category'].replace('Comedy', 'Arts', inplace = True)
data.rename({'more':'directors_actors'}, axis = 1, inplace = True)
data.replace(['No data found', 'No more information found', 
              'No title found', 'No tags found', 'No image found', 'No description found'], np.nan, inplace = True)

data['directors_actors'] = data['directors_actors'].apply(str).apply(lambda x : x.split(', ') if x != 'nan' else [])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13885 entries, 0 to 2528
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                13885 non-null  int64  
 1   category          13885 non-null  object 
 2   title             13883 non-null  object 
 3   series            13286 non-null  object 
 4   episode_name      13874 non-null  object 
 5   description       13883 non-null  object 
 6   description2      13880 non-null  object 
 7   tags              13883 non-null  object 
 8   image             13883 non-null  object 
 9   directors_actors  13885 non-null  object 
 10  tags2             13880 non-null  object 
 11  publication_date  13880 non-null  object 
 12  rating            11948 non-null  object 
 13  duration_sec      13879 non-null  float64
dtypes: float64(1), int64(1), object(12)
memory usage: 1.6+ MB


In [119]:
cols = list(data.columns)
# dropping nans where it is necessary
unwanted = ['directors_actors', 'rating', 'series', 'directors_actors']
cols = [x for x in cols if x not in unwanted]

clean = data.dropna(subset = cols)
clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13873 entries, 0 to 2528
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                13873 non-null  int64  
 1   category          13873 non-null  object 
 2   title             13873 non-null  object 
 3   series            13279 non-null  object 
 4   episode_name      13873 non-null  object 
 5   description       13873 non-null  object 
 6   description2      13873 non-null  object 
 7   tags              13873 non-null  object 
 8   image             13873 non-null  object 
 9   directors_actors  13873 non-null  object 
 10  tags2             13873 non-null  object 
 11  publication_date  13873 non-null  object 
 12  rating            11946 non-null  object 
 13  duration_sec      13873 non-null  float64
dtypes: float64(1), int64(1), object(12)
memory usage: 1.6+ MB


In [120]:
films = clean[(clean['series'].isna()) & (clean['category'] != 'News')]
films.shape

(562, 14)

# Creating special columns for episodes

In [121]:
seasons = []
eps = []
ep_descs = []

for index, row in clean.iterrows():
    i = index
    txt = row.episode_name
    try:
        season = re.findall('Series [0-9]{1,2}', txt)[0]
        season = re.findall('[0-9]+', season)[0]
        seasons.append(season)
    except:
        seasons.append(np.nan)
    try: 
        ep = re.findall('Episode [0-9]{1,2}', txt)[0]
        ep = re.findall('[0-9]+', ep)[0]
        eps.append(ep)
    except:
        eps.append('')
    desc = re.sub('[0-9]+', '', txt)
    desc = desc.replace('Series', '')
    desc = desc.replace('Episode', '')
    ep_descs.append(desc)

clean = clean.assign(season=seasons, episode=eps, episode_title=ep_descs)
clean.head(3)

Unnamed: 0,id,category,title,series,episode_name,description,description2,tags,image,directors_actors,tags2,publication_date,rating,duration_sec,season,episode,episode_title
0,0,Panel Discussion,Spicks And Specks,Spicks And Specks,"Series 9 Missy Higgins, Dave O'Neil, Yeo & Jud...","Join Adam Hills, Myf Warhurst and Alan Brough ...",Missy Higgins and Dave O'Neil help Alan battle...,"[ABC TV, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE200...,"[Hosts Adam Hills, Myf Warhurst, Alan Brough]","[abc1, australia, panel, music, culture, enter...",2021-06-20 20:27:00,PG,2701.0,9,,"Missy Higgins, Dave O'Neil, Yeo & Judith Lucy"
1,1,Panel Discussion,Would I Lie To You?,Would I Lie To You?,Series 13 Episode 4,Rob Brydon is back in the host's chair for ano...,Host Rob Brydon and team captains Lee Mack and...,"[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW213...,[Host Rob Brydon],"[abc2, uk, comedy, panel, entertainment, cult-...",2022-02-08 20:30:35,PG,1737.0,13,4.0,
2,2,Panel Discussion,Whovians,Whovians,Series 3 Episode 9,"Join Rove McManus, Tegan Higginbotham and Bajo...","Rove McManus, Tegan Higginbotham and Bajo are ...","[ABC TV Plus, COMEDY, PANEL & DISCUSSION]",https://cdn.iview.abc.net.au/thumbs/i/le/LE191...,"[Hosts Rove McManus, Tegan Higginbotham, Steve...","[abc2, aussie, comedy, panel, entertainment, q...",2020-03-05 21:38:00,PG,2090.0,3,9.0,


In [122]:
# Write the concatenated data to a new file
if os.path.exists(directory_path + 'full_data.pkl'):
    os.remove(directory_path + 'full_data.pkl')

with open(directory_path + 'full_data.pkl', 'wb') as f:
    pickle.dump(clean, f)

In [128]:
# Separating the films from the series
films = clean[(clean['series'].isna()) & (clean['category'] != 'News') & (clean['category'] != 'Panel Discussion')]

if os.path.exists(directory_path + 'full_movies.pkl'):
    os.remove(directory_path + 'full_movies.pkl')

with open(directory_path + 'full_movies.pkl', 'wb') as f:
    pickle.dump(films, f)

# Creating a new dataframe with only the series

In [125]:
series_raw = clean[(clean['series'].isna()==False)]

# assuming your original dataframe is called 'tv_series'
series_grouped = series_raw.groupby('title')

# create a new dataframe with the relevant information per TV series
series = pd.DataFrame({
    'category': series_grouped['category'].first(),
    'description': series_grouped['description'].first(),
    'description2': series_grouped['description2'].agg(list),
    'directors_actors': series_grouped['directors_actors'].sum().apply(set).apply(list),
    'publication_date': series_grouped['publication_date'].first(),
    'rating': series_grouped['rating'].first(),
    'duration_sec': series_grouped['duration_sec'].agg(list),
    'tags': series_grouped['tags'].first(),
    'tags2' : series_grouped['tags2'].sum().apply(set).apply(list)
})

In [126]:
# Write the concatenated data to a new file
if os.path.exists(directory_path + 'full_shows.pkl'):
    os.remove(directory_path + 'full_shows.pkl')

with open(directory_path + 'full_shows.pkl', 'wb') as f:
    pickle.dump(series, f)