In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

  from .autonotebook import tqdm as notebook_tqdm


### Load the dataset, you need to login to your huggingface account for this

In [3]:
dataset = load_dataset('datadrivenscience/movie-genre-prediction')

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [5]:
dataset['train']

Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre'],
    num_rows: 54000
})

### Convert To Dataframe

In [6]:
df_train_all = pd.DataFrame(dataset['train'])

In [7]:
df_train_all.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action
5,46636,Le démon dans l'île,"On an island, things start going crazy as the ...",horror
6,53777,Candid,A video voyeur stalks women in the city with a...,horror
7,24872,Hired,Twin brothers separated at birth and worlds ap...,crime
8,17224,Miruthan,A traffic police officer teams up with his fri...,adventure
9,54900,Until You See Me,A legendary tale unravels.,mystery


In [8]:
len(df_train_all)

54000

In [9]:
df_test_all = pd.DataFrame(dataset['test'])

In [10]:
df_test_all.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action
5,16524,The Syndicate,"An alcoholic, his wife, a down and out America...",action
6,21245,12 Depths,"""Struck by amnesia and framed for murder, a ma...",action
7,84681,The Feud,A mysterious man returns to his home in Iowa f...,action
8,73352,Cabin Fever: Patient Zero,When a group of friends enjoying a bachelor cr...,action
9,37966,Peregrina,"After a near death experience, a young woman b...",action


In [11]:
len(df_test_all)

36000

### Create The Samples

In [12]:
df_train = df_train_all.sample(n=1000)

In [13]:
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre
13264,64982,Jeena Marna Tere Sang,Amar is the son of a millionaire. He decides t...,romance
10982,53192,Orochi,The story of Orochi revolves around the life o...,horror
46978,54388,Alien Code,After deciphering a message found in a satelli...,mystery
8427,53892,Ga-ga: Glory to the Heroes,"In the 21st century, prisoners aboard penitent...",horror
2285,1071,Rasigan,Vijay and Anitha secretly love each other but ...,action


In [14]:
df_rest = df_train_all.drop(labels=df_train.index)

In [15]:
df_validate = df_rest.sample(n=500)

In [16]:
df_validate.head()

Unnamed: 0,id,movie_name,synopsis,genre
52253,43524,Blood Brothers,"Two vampire friends, Nick and Tree, unite to p...",fantasy
32773,56278,Spirit Cabinet,Every ghost story is a mystery. Eight people s...,mystery
3836,15584,Mission Kill,An ex-Green Beret visits one of his army buddi...,adventure
12014,41468,Fantomas Unleashed,"When Professor Marchand, a famous scientist, m...",fantasy
7951,89988,Tragic Hero,"After being released from prison, Tang Kat Yun...",thriller


In [17]:
df_test = df_test_all.sample(n=200)

In [18]:
df_test.head()

Unnamed: 0,id,movie_name,synopsis,genre
34709,41381,Sea Prince and the Fire Child,A prince of the sea and a child of fire engage...,action
32962,69766,"Red Roses, Call for a Girl",UNSUSPECTING YOUNG GIRLS ARE SOLD TO WHITE SLA...,action
21790,69054,Hillbilly Blitzkrieg,Nazi spies mistake Snuffy Smith's moonshine fo...,action
16780,44352,63,A sixty-three year old high school substitute ...,action
11137,9114,The Reckless Way,A woman finds work as a model and takes advant...,action


### One Hot Encode The Genre

In [22]:
ohe = OneHotEncoder(sparse_output=False)

In [23]:
genre_encoded_train = ohe.fit_transform(df_train[['genre']])
genre_encoded_validate = ohe.fit_transform(df_validate[['genre']])
genre_encoded_test = ohe.fit_transform(df_test[['genre']])

In [24]:
genre_encoded_train.shape

(1000, 10)

In [25]:
genre_encoded_train[0]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [26]:
Y_train = genre_encoded_train.argmax(axis=1)
Y_validate = genre_encoded_validate.argmax(axis=1)
Y_test = genre_encoded_test.argmax(axis=1)

In [27]:
len(Y_train)

1000

### Add Movie Name And Synopsis To One Column

In [28]:
df_train['text'] = df_train['movie_name'] + ' ' + df_train['synopsis']
df_validate['text'] = df_validate['movie_name'] + ' ' + df_validate['synopsis']
df_test['text'] = df_test['movie_name'] + ' ' + df_test['synopsis']

In [29]:
df_train.head(10)

Unnamed: 0,id,movie_name,synopsis,genre,text
13264,64982,Jeena Marna Tere Sang,Amar is the son of a millionaire. He decides t...,romance,Jeena Marna Tere Sang Amar is the son of a mil...
10982,53192,Orochi,The story of Orochi revolves around the life o...,horror,Orochi The story of Orochi revolves around the...
46978,54388,Alien Code,After deciphering a message found in a satelli...,mystery,Alien Code After deciphering a message found i...
8427,53892,Ga-ga: Glory to the Heroes,"In the 21st century, prisoners aboard penitent...",horror,Ga-ga: Glory to the Heroes In the 21st century...
2285,1071,Rasigan,Vijay and Anitha secretly love each other but ...,action,Rasigan Vijay and Anitha secretly love each ot...
33286,75692,Meta Runner Season 1 (Full Movie Cut),In celebration of Season 2 being released soon...,scifi,Meta Runner Season 1 (Full Movie Cut) In celeb...
44082,66085,The Witness Chair,A woman watches as false murder-trial evidence...,romance,The Witness Chair A woman watches as false mur...
28609,28432,Funny Pets 2,Pets sit and discuss what it's like being a pe...,family,Funny Pets 2 Pets sit and discuss what it's li...
34997,37669,Satanic Verses: A New Era,Action Thriller of Biblical Fiction that shows...,fantasy,Satanic Verses: A New Era Action Thriller of B...
21200,80614,The Chronic Argonauts,"Adaptation of the graphic novel, ""The Chronic ...",scifi,The Chronic Argonauts Adaptation of the graphi...


### Using TFIDF

In [31]:
df_train_all['text'] = df_train_all['movie_name'] + ' ' + df_train_all['synopsis']

In [32]:
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(df_train_all['text'])

In [33]:
len(tfidf_vect.vocabulary_)

60378

In [34]:
X_train_tfidf_vec = tfidf_vect.transform(df_train['text'])
X_validate_tfidf_vec = tfidf_vect.transform(df_validate['text'])
X_test_tfidf_vec = tfidf_vect.transform(df_test['text'])

In [35]:
X_train_tfidf_vec[0:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Running A Naive Bayes Model & Finding The Accuracy

In [39]:
nb_model = MultinomialNB().fit(X_train_tfidf_vec, Y_train)

In [42]:
predicted_train = nb_model.predict(X_train_tfidf_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_valid = nb_model.predict(X_validate_tfidf_vec)
print('Validation accuracy: ', np.mean(predicted_valid == Y_validate))
predicted_test = nb_model.predict(X_test_tfidf_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.995
Validation accuracy:  0.212
Test Accuracy:  0.01


### Use Random Forests

In [45]:
rf_model = RandomForestClassifier(n_estimators = 100).fit(X_train_tfidf_vec, Y_train)

In [46]:
predicted_train = rf_model.predict(X_train_tfidf_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_valid = rf_model.predict(X_validate_tfidf_vec)
print('Validation accuracy: ', np.mean(predicted_valid == Y_validate))
predicted_test = rf_model.predict(X_test_tfidf_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.997
Validation accuracy:  0.212
Test Accuracy:  0.12


### Increase The Data Set Size, Train & Predict On Whole Dataset

In [47]:
ohe = OneHotEncoder(sparse_output=False)

In [48]:
genre_encoded_train_all = ohe.fit_transform(df_train_all[['genre']])
genre_encoded_test_all = ohe.fit_transform(df_test_all[['genre']])

In [49]:
Y_train = genre_encoded_train_all.argmax(axis=1)
Y_test = genre_encoded_test_all.argmax(axis=1)

In [50]:
len(Y_train)

54000

In [60]:
df_train_all['text'] = df_train_all['movie_name'] + ' ' + df_train_all['synopsis']
df_test_all['text'] = df_test_all['movie_name'] + ' ' + df_test_all['synopsis']

In [61]:
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(df_train_all['text'])

In [62]:
X_train_all_tfidf_vec = tfidf_vect.transform(df_train_all['text'])
X_test_all_tfidf_vec = tfidf_vect.transform(df_test_all['text'])

In [63]:
X_train_all_tfidf_vec.shape

(54000, 60378)

In [64]:
%%time
nb_model = MultinomialNB().fit(X_train_all_tfidf_vec, Y_train)

CPU times: user 33.9 ms, sys: 5.66 ms, total: 39.6 ms
Wall time: 37.3 ms


In [65]:
predicted_train = nb_model.predict(X_train_all_tfidf_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_test = nb_model.predict(X_test_all_tfidf_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.6105185185185186
Test Accuracy:  0.08141666666666666


In [68]:
%%time
rf_model = RandomForestClassifier(n_estimators = 1000, n_jobs=-1, verbose=1).fit(X_train_all_tfidf_vec, Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 24.4min


CPU times: user 3h 34min 51s, sys: 1min 3s, total: 3h 35min 54s
Wall time: 30min 53s


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 30.9min finished


In [69]:
predicted_train = nb_model.predict(X_train_all_tfidf_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_test = nb_model.predict(X_test_all_tfidf_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.6105185185185186
Test Accuracy:  0.08141666666666666
