Before running this, genres.zip needs to be unzipped to put genres.list in Datasets, and the plot.part files need to be unzipped to put plot.list in the Datasets folder. Once those files are present, then this can run properly. (Thanks to Github issues with very large files)

In [32]:
import pandas as pd

Let's get the genres.list file from the Datasets folder

In [33]:
# Let's get the genres.list file from the Datasets folder
# Each line has the name of the media and one genre
# A single piece of media can have multiple genres, so the same media can appear multiple times
# We'll deal with that later, for now let's just pull in the data
# This has been edited from the original version to remove duplicated tab characters that were causing issues

genredf = pd.read_csv('Datasets/genres.list', sep='\t', encoding='latin1', header=None, names=['Title', 'Genre'])
print(genredf.shape)
print(genredf.head())

(2658941, 2)
                   Title        Genre
0          !Next? (1994)  Documentary
1       #1 Single (2006)   Reality-TV
2  #15SecondScare (2015)       Horror
3  #15SecondScare (2015)        Short
4  #15SecondScare (2015)     Thriller


In [34]:
# Let's see how many unique genres we have
print(genredf['Genre'].nunique())

# Put all the unique genres into a list
genrelist = genredf['Genre'].unique()

# Let's see what the list looks like
print(genrelist)

# And let's see how many unique titles we have
print(genredf['Title'].nunique())

# No, not going to print all the titles, that would be dumb

36
['Documentary' 'Reality-TV' 'Horror' 'Short' 'Thriller' 'Drama' 'Crime'
 'Comedy' 'Talk-Show' 'Musical' 'Romance' 'Family' 'Mystery' 'Action'
 'News' 'Sport' 'Sci-Fi' 'Biography' 'Adventure' 'History' 'Music'
 'Game-Show' 'War' 'Fantasy' 'Animation' 'Adult' 'Western' 'Reality-tv'
 'Lifestyle' 'Sci-fi' 'Film-Noir' 'Hardcore' 'Sex' 'Experimental'
 'Commercial' 'Erotica']
1466038


In [35]:
# If something is an episode of a TV show, it will have a section between {} in the title
# We can use this to filter out the episodes
# Let's see how many titles have this
print(genredf['Title'].str.contains('{').sum())

# Create a new dataframe that only contains titles that don't have a { in them
moviegenredf = genredf[~genredf['Title'].str.contains('{')]
print(moviegenredf.shape)
# Let's see how many unique titles we have now
print(moviegenredf['Title'].nunique())

21165
(2637776, 2)
1451587


In [36]:
# Get the plot.list file from the Datasets folder
# Each line has the name of the media and a plot summary

plotdf = pd.read_csv('Datasets/plot.list', sep='\t', encoding='latin1', header=None, names=['Title', 'Plot'], on_bad_lines='skip')
print(plotdf.shape)
print(plotdf.head())

(600698, 2)
                                           Title  \
0                             #7DaysLater (2013)   
1  #BlackLove (2015) {Bringing Sexy Back (#1.3)}   
2     #BlackLove (2015) {Crash the Party (#1.9)}   
3       #BlackLove (2015) {Like a Virgin (#1.4)}   
4          #BlackLove (2015) {Maybe Baby (#1.8)}   

                                                Plot  
0  #7dayslater is an interactive comedy series fe...  
1  This week, the five women work on getting what...  
2  With just one week left in the workshops, the ...  
3  All of the women are struggling with what make...  
4  As the women focus on what commitment means to...  


In [37]:
# If something is an episode of a TV show, it will have a section between {} in the title
# We can use this to filter out the episodes
# Let's see how many titles have this
print(plotdf['Title'].str.contains('{').sum())

# Create a new dataframe that only contains titles that don't have a { in them
movieplotdf = plotdf[~plotdf['Title'].str.contains('{')]
print(movieplotdf.shape)

244469
(356229, 2)


In [38]:
# Now, let's merge the two dataframes on the Title column so we have a single dataframe with the genre and plot
# Any titles that are in one dataframe but not the other will be dropped
mergedf = pd.merge(moviegenredf, movieplotdf, on='Title', how='inner')
print(mergedf.shape)
print(mergedf.head())

# Let's see how many unique titles we have now
print(mergedf['Title'].nunique())

(744545, 3)
                Title   Genre  \
0  #7DaysLater (2013)  Comedy   
1        #Cake (2015)  Comedy   
2  #DaddyLeaks (????)  Comedy   
3      #Elmira (2014)  Comedy   
4        #Fuga (2016)  Action   

                                                Plot  
0  #7dayslater is an interactive comedy series fe...  
1  #CAKE is a hour-long serial narrative comedy a...  
2  The life of four close friends in their late t...  
3  #Elmira follows the story of a bunch of strang...  
4  Months after an apocalyptic event, a group of ...  
344291


In [39]:
# A list of all the unique genres
genrelist = mergedf['Genre'].unique()

print(genrelist)

['Comedy' 'Action' 'Drama' 'Horror' 'Family' 'Sci-Fi' 'Romance'
 'Biography' 'Crime' 'Reality-TV' 'Talk-Show' 'Music' 'Game-Show'
 'Adventure' 'War' 'Sport' 'Documentary' 'Animation' 'Fantasy' 'Mystery'
 'Thriller' 'Short' 'History' 'News' 'Adult' 'Musical' 'Western'
 'Lifestyle' 'Reality-tv' 'Film-Noir' 'Hardcore' 'Sex']


In [40]:
# That is the same as before, so we haven't lost any genres
# So, here's the issue: some titles have multiple genres and are listed multiple times, but we don't want to lose any of the genres
# So, instead of a single genre column, we'll create a column for each genre
# If a title has that genre, the column will be 1, otherwise it will be 0 - one-hot encoding
# A lot of columns, but that's fine

# Create a new dataframe with the one-hot encoded genres
onehotdf = pd.get_dummies(mergedf['Genre'])

# Add the Title column back in
onehotdf['Title'] = mergedf['Title']

# Add the Plot column back in
onehotdf['Plot'] = mergedf['Plot']

# Reorder the columns so Title and Plot are first
cols = onehotdf.columns.tolist()
cols = cols[-2:] + cols[:-2]
onehotdf = onehotdf[cols]

# Combine any rows with the same title, adding the one-hot encoded genres together but keeping the title and plot
# If any of the genres are greater than 1, set them to 1
onehotdf = onehotdf.groupby('Title').sum().reset_index()

print(onehotdf.shape)
print(onehotdf.head())

(344291, 34)
                                    Title  \
0                             # (2012/II)   
1                               #1 (2018)   
2          #1 Cheerleader Camp (2010) (V)   
3                 #1 Serial Killer (2013)   
4  #1 at the Apocalypse Box Office (2015)   

                                                Plot  Action  Adult  \
0  The night falls on the big city and a hooded f...       0      0   
1  After reaching #1 at Mutual of New York and se...       0      0   
2  When they're hired to work at a cheerleading c...       0      0   
3  Years of seething rage against the racism he's...       0      0   
4  Jules is, self declared, the most useless pers...       0      0   

   Adventure  Animation  Biography  Comedy  Crime  Documentary  ...  \
0          0          1          0       0      0            0  ...   
1          0          0          0       0      0            1  ...   
2          0          0          0       1      0            0  ...   
3    

In [41]:
# Print the number of total rows in the dataframe
print(onehotdf.shape[0])

# Print the number of unique titles in the dataframe
print(onehotdf['Title'].nunique())

# Find the number of titles that have more than one genre
print(onehotdf[onehotdf.columns[2:]].sum(axis=1).gt(1).sum())

# Find the number of titles that have only one genre
print(onehotdf[onehotdf.columns[2:]].sum(axis=1).eq(1).sum())

# Merge together the 'Reality-TV' and 'Reality-tv' columns into a single column
onehotdf['Reality-TV'] = onehotdf['Reality-TV'] + onehotdf['Reality-tv']

# Drop the 'Reality-tv' column
onehotdf.drop(columns=['Reality-tv'], inplace=True)

# Identify any titles where the genre values are greater than 1
# If any of the genres are greater than 1, set them to 1
onehotdf[onehotdf.columns[2:]] = onehotdf[onehotdf.columns[2:]].clip(upper=1)

# Print the number of columns in the dataframe
print(onehotdf.shape[1])

# If any columns have a sum of 0, remove them
onehotdf = onehotdf.loc[:, (onehotdf != 0).any(axis=0)]

# Print the number of columns in the dataframe
print(onehotdf.shape[1])

344291
344291
232218
112073
33
33


In [42]:
# Write the onehot dataframe to a CSV file
onehotdf.to_csv('Datasets/onehotplotgenre.csv', index=False)