### Load the required libraries

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.corpus import wordnet
import nltk

  from .autonotebook import tqdm as notebook_tqdm


### Load the dataset, you need to login to your huggingface account for this

In [2]:
dataset = load_dataset('datadrivenscience/movie-genre-prediction')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [4]:
dataset['train']

Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre'],
    num_rows: 54000
})

### Convert To Dataframe

In [5]:
df_train_all = pd.DataFrame(dataset['train'])

In [6]:
df_train_all.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action
5,46636,Le démon dans l'île,"On an island, things start going crazy as the ...",horror
6,53777,Candid,A video voyeur stalks women in the city with a...,horror
7,24872,Hired,Twin brothers separated at birth and worlds ap...,crime
8,17224,Miruthan,A traffic police officer teams up with his fri...,adventure
9,54900,Until You See Me,A legendary tale unravels.,mystery


In [7]:
len(df_train_all)

54000

In [8]:
df_test_all = pd.DataFrame(dataset['test'])

In [9]:
df_test_all.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action
5,16524,The Syndicate,"An alcoholic, his wife, a down and out America...",action
6,21245,12 Depths,"""Struck by amnesia and framed for murder, a ma...",action
7,84681,The Feud,A mysterious man returns to his home in Iowa f...,action
8,73352,Cabin Fever: Patient Zero,When a group of friends enjoying a bachelor cr...,action
9,37966,Peregrina,"After a near death experience, a young woman b...",action


In [10]:
len(df_test_all)

36000

### Create The Samples

In [11]:
df_train = df_train_all.sample(n=1000)

In [12]:
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre
6555,9757,Who Framed Roger Rabbit 2,A sequel to Who Framed Roger Rabbit (1988).,adventure
46679,38702,Ru lai ba gua gun,Fifth brother is separated from his kin whilst...,fantasy
4936,37504,Me Two,"Ranu, a shy accountant, suddenly discovers a c...",fantasy
10906,26900,Killer's Moon,"Four mental patients - who, due to unauthorize...",crime
11691,84807,The Storm Thief,The storm robbery is part of the greatest art ...,thriller


In [13]:
df_rest = df_train_all.drop(labels=df_train.index)

In [14]:
df_validate = df_rest.sample(n=500)

In [15]:
df_validate.head()

Unnamed: 0,id,movie_name,synopsis,genre
22295,86502,The Care and Feeding of Exotic Pets,A kidnapper obsessed with reptiles abducts a m...,thriller
10717,2794,Hacker,Thirteen-year-old Benjamin discovers that his ...,action
5958,32836,Doraemon: Nobita and the Wind Wizard,A boy befriends a wind-child and goes on an ad...,family
39173,43609,Gye Nyame: The Akan War,Nothing was ever as it seemed.,fantasy
11022,43766,Pan Twardowski,"A young nobleman, pursued by Satan since child...",fantasy


In [16]:
df_test = df_test_all.sample(n=200)

In [17]:
df_test.head()

Unnamed: 0,id,movie_name,synopsis,genre
12332,79455,Sage,A troubled cop becomes the reluctant protector...,action
31326,76376,Psychonaut,"Enabled by a mysterious futuristic machine, Ma...",action
9799,44365,Shinema no tenshi,"A 122-year old movie theater, the Daikokuza, i...",action
7245,35255,Mai Mai Miracle,"Inspired by her grandfather's stories, young S...",action
5899,51673,Epitaph: Bread and Salt,"A family of Estrie hunters, unite with a demon...",action


### One Hot Encode The Genre

In [18]:
ohe = OneHotEncoder(sparse_output=False)

In [19]:
genre_encoded_train = ohe.fit_transform(df_train[['genre']])
genre_encoded_validate = ohe.fit_transform(df_validate[['genre']])
genre_encoded_test = ohe.fit_transform(df_test[['genre']])

In [20]:
genre_encoded_train.shape

(1000, 10)

In [21]:
genre_encoded_train[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [22]:
Y_train = genre_encoded_train.argmax(axis=1)
Y_validate = genre_encoded_validate.argmax(axis=1)
Y_test = genre_encoded_test.argmax(axis=1)

In [23]:
Y_train

array([1, 4, 4, 2, 9, 7, 6, 2, 4, 3, 2, 7, 6, 5, 9, 7, 4, 9, 2, 3, 7, 9,
       4, 4, 4, 8, 9, 6, 3, 1, 1, 3, 8, 8, 4, 7, 5, 1, 5, 9, 3, 2, 5, 0,
       0, 9, 4, 8, 7, 5, 0, 8, 4, 4, 0, 3, 6, 2, 0, 9, 1, 5, 8, 9, 8, 0,
       9, 9, 0, 6, 7, 2, 7, 1, 6, 1, 3, 2, 9, 9, 4, 1, 4, 5, 1, 3, 7, 7,
       7, 0, 4, 1, 3, 8, 8, 7, 0, 3, 3, 9, 2, 4, 3, 6, 5, 4, 0, 6, 3, 9,
       1, 6, 0, 2, 0, 3, 6, 1, 7, 3, 3, 4, 1, 7, 2, 4, 4, 8, 5, 8, 3, 5,
       4, 9, 2, 4, 9, 2, 4, 1, 9, 3, 4, 1, 3, 2, 8, 4, 1, 4, 3, 7, 6, 6,
       3, 7, 6, 1, 2, 5, 3, 8, 6, 4, 5, 0, 2, 4, 9, 7, 4, 4, 2, 6, 9, 4,
       0, 5, 1, 1, 5, 8, 5, 3, 7, 9, 4, 9, 5, 7, 2, 3, 5, 6, 0, 2, 7, 4,
       3, 9, 6, 5, 6, 0, 5, 8, 4, 5, 7, 7, 7, 6, 1, 7, 8, 9, 8, 8, 6, 1,
       2, 2, 3, 3, 9, 6, 7, 2, 3, 6, 2, 7, 5, 0, 2, 6, 3, 7, 4, 8, 7, 2,
       4, 4, 2, 7, 1, 0, 2, 7, 5, 1, 2, 1, 1, 9, 2, 6, 6, 0, 2, 1, 7, 4,
       7, 8, 1, 7, 9, 3, 3, 8, 3, 8, 9, 8, 3, 8, 5, 0, 4, 1, 4, 8, 2, 0,
       0, 9, 0, 6, 9, 4, 4, 0, 6, 5, 0, 6, 8, 2, 5,

In [24]:
len(Y_train)

1000

### Add Movie Name And Synopsis To One Column

In [25]:
df_train['text'] = df_train['movie_name'] + ' ' + df_train['synopsis']
df_validate['text'] = df_validate['movie_name'] + ' ' + df_validate['synopsis']
df_test['text'] = df_test['movie_name'] + ' ' + df_test['synopsis']

In [26]:
df_train.head(10)

Unnamed: 0,id,movie_name,synopsis,genre,text
6555,9757,Who Framed Roger Rabbit 2,A sequel to Who Framed Roger Rabbit (1988).,adventure,Who Framed Roger Rabbit 2 A sequel to Who Fram...
46679,38702,Ru lai ba gua gun,Fifth brother is separated from his kin whilst...,fantasy,Ru lai ba gua gun Fifth brother is separated f...
4936,37504,Me Two,"Ranu, a shy accountant, suddenly discovers a c...",fantasy,"Me Two Ranu, a shy accountant, suddenly discov..."
10906,26900,Killer's Moon,"Four mental patients - who, due to unauthorize...",crime,"Killer's Moon Four mental patients - who, due ..."
11691,84807,The Storm Thief,The storm robbery is part of the greatest art ...,thriller,The Storm Thief The storm robbery is part of t...
51741,66355,The Money,"Sief, is a con artist who's hired by the beaut...",romance,"The Money Sief, is a con artist who's hired by..."
28883,60282,Ashes of Paradise,A judge falls from the roof of the Federal Cou...,mystery,Ashes of Paradise A judge falls from the roof ...
13507,21610,Darwaza Bandh Rakho,Small-time crooks bungle a kidnapping attempt.,crime,Darwaza Bandh Rakho Small-time crooks bungle a...
41972,43895,Signal to Noise,Over a week-long stay at his childhood best fr...,fantasy,Signal to Noise Over a week-long stay at his c...
10581,34107,Shoulder to Shoulder: One Team's Legacy,A group of friends confront the realities of m...,family,Shoulder to Shoulder: One Team's Legacy A grou...


### Using Simple Count Vectorizer

In [27]:
df_train_all['text'] = df_train_all['movie_name'] + ' ' + df_train_all['synopsis']

In [28]:
count_vect = CountVectorizer()
count_vect.fit(df_train_all['text'])

In [29]:
len(count_vect.vocabulary_)

60678

In [30]:
X_train_count_vec = count_vect.transform(df_train['text'])
X_validate_count_vec = count_vect.transform(df_validate['text'])
X_test_count_vec = count_vect.transform(df_test['text'])

In [31]:
X_train_count_vec[0:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Running A Naive Bayes Model

In [32]:
nb_model = MultinomialNB().fit(X_train_count_vec, Y_train)

### Find The Accuracy

#### Train

In [33]:
predicted_train = nb_model.predict(X_train_count_vec)

In [34]:
np.mean(predicted_train == Y_train)

0.971

#### Validation

In [35]:
predicted_valid = nb_model.predict(X_validate_count_vec)

In [36]:
np.mean(predicted_valid == Y_validate)

0.212

#### Test

In [37]:
predicted_test = nb_model.predict(X_test_count_vec)

In [38]:
np.mean(predicted_test == Y_test)

0.075

### Use Random Forests

In [39]:
rf_model = RandomForestClassifier(n_estimators = 100).fit(X_train_count_vec, Y_train)

### Find The Accuracy

#### Train

In [40]:
predicted_train = rf_model.predict(X_train_count_vec)

In [41]:
np.mean(predicted_train == Y_train)

0.997

#### Validation

In [42]:
predicted_valid = rf_model.predict(X_validate_count_vec)

In [43]:
np.mean(predicted_valid == Y_validate)

0.224

#### Test

In [44]:
predicted_test = rf_model.predict(X_test_count_vec)

In [45]:
np.mean(predicted_test == Y_test)

0.09

### Remove Stop Words

In [46]:
count_vect = CountVectorizer(stop_words='english')
count_vect.fit(df_train_all['text'])

In [47]:
len(count_vect.vocabulary_)

60378

In [48]:
X_train_count_vec = count_vect.transform(df_train['text'])
X_validate_count_vec = count_vect.transform(df_validate['text'])
X_test_count_vec = count_vect.transform(df_test['text'])

### Rerun The Naive Bayes Model

In [49]:
nb_model = MultinomialNB().fit(X_train_count_vec, Y_train)

In [50]:
predicted_train = nb_model.predict(X_train_count_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_valid = nb_model.predict(X_validate_count_vec)
print('Validation accuracy: ', np.mean(predicted_valid == Y_validate))
predicted_test = nb_model.predict(X_test_count_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.99
Validation accuracy:  0.268
Test Accuracy:  0.08


### Initilize the NLTK functions

In [51]:
wst = WhitespaceTokenizer()
lemma = nltk.wordnet.WordNetLemmatizer()

### Remove Punctuations & Numbers

In [52]:
def removePunctuationsAndNumbers(data, column):
    text_split = wst.tokenize(data[column])
    noPunctAndNumbers = ' '.join(word_tokenize(' '.join([''.join([char for char in word if char.isalpha()]).lower() for word in text_split])))
    return noPunctAndNumbers

In [53]:
df_train['processed'] = df_train.apply(removePunctuationsAndNumbers, axis=1, args=('text',))
df_validate['processed'] = df_validate.apply(removePunctuationsAndNumbers, axis=1, args=('text',))
df_test['processed'] = df_test.apply(removePunctuationsAndNumbers, axis=1, args=('text',))

In [54]:
df_train_all['processed'] = df_train_all.apply(removePunctuationsAndNumbers, axis=1, args=('text',))

In [55]:
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre,text,processed
6555,9757,Who Framed Roger Rabbit 2,A sequel to Who Framed Roger Rabbit (1988).,adventure,Who Framed Roger Rabbit 2 A sequel to Who Fram...,who framed roger rabbit a sequel to who framed...
46679,38702,Ru lai ba gua gun,Fifth brother is separated from his kin whilst...,fantasy,Ru lai ba gua gun Fifth brother is separated f...,ru lai ba gua gun fifth brother is separated f...
4936,37504,Me Two,"Ranu, a shy accountant, suddenly discovers a c...",fantasy,"Me Two Ranu, a shy accountant, suddenly discov...",me two ranu a shy accountant suddenly discover...
10906,26900,Killer's Moon,"Four mental patients - who, due to unauthorize...",crime,"Killer's Moon Four mental patients - who, due ...",killers moon four mental patients who due to u...
11691,84807,The Storm Thief,The storm robbery is part of the greatest art ...,thriller,The Storm Thief The storm robbery is part of t...,the storm thief the storm robbery is part of t...


#### Retrain The CountVectorizer

In [56]:
count_vect = CountVectorizer(stop_words='english')
count_vect.fit(df_train_all['processed'])

In [57]:
len(count_vect.vocabulary_)

66659

In [58]:
X_train_count_vec = count_vect.transform(df_train['text'])
X_validate_count_vec = count_vect.transform(df_validate['text'])
X_test_count_vec = count_vect.transform(df_test['text'])

### Rerun The NB Model

In [59]:
nb_model = MultinomialNB().fit(X_train_count_vec, Y_train)

In [60]:
predicted_train = nb_model.predict(X_train_count_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_valid = nb_model.predict(X_validate_count_vec)
print('Validation accuracy: ', np.mean(predicted_valid == Y_validate))
predicted_test = nb_model.predict(X_test_count_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.989
Validation accuracy:  0.266
Test Accuracy:  0.08


### Lemmatize Text

In [61]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatizeText(data, column):
    text = data[column]
    word_tokens = word_tokenize(text)
    lemmatized_words = [lemma.lemmatize(word, get_wordnet_pos(word)) for word in word_tokens]
    return ' '.join(lemmatized_words)

In [62]:
df_train['processed'] = df_train.apply(lemmatizeText, axis=1, args=('processed',))
df_validate['processed'] = df_validate.apply(lemmatizeText, axis=1, args=('processed',))
df_test['processed'] = df_test.apply(lemmatizeText, axis=1, args=('processed',))

In [63]:
df_train_all['processed'] = df_train_all.apply(lemmatizeText, axis=1, args=('processed',))

In [64]:
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre,text,processed
6555,9757,Who Framed Roger Rabbit 2,A sequel to Who Framed Roger Rabbit (1988).,adventure,Who Framed Roger Rabbit 2 A sequel to Who Fram...,who frame roger rabbit a sequel to who frame r...
46679,38702,Ru lai ba gua gun,Fifth brother is separated from his kin whilst...,fantasy,Ru lai ba gua gun Fifth brother is separated f...,ru lai ba gua gun fifth brother be separate fr...
4936,37504,Me Two,"Ranu, a shy accountant, suddenly discovers a c...",fantasy,"Me Two Ranu, a shy accountant, suddenly discov...",me two ranu a shy accountant suddenly discover...
10906,26900,Killer's Moon,"Four mental patients - who, due to unauthorize...",crime,"Killer's Moon Four mental patients - who, due ...",killer moon four mental patient who due to una...
11691,84807,The Storm Thief,The storm robbery is part of the greatest art ...,thriller,The Storm Thief The storm robbery is part of t...,the storm thief the storm robbery be part of t...


In [65]:
count_vect = CountVectorizer(stop_words='english')
count_vect.fit(df_train_all['processed'])

In [66]:
len(count_vect.vocabulary_)

58233

In [67]:
X_train_count_vec = count_vect.transform(df_train['text'])
X_validate_count_vec = count_vect.transform(df_validate['text'])
X_test_count_vec = count_vect.transform(df_test['text'])

### Rerun The NB Model

In [68]:
nb_model = MultinomialNB().fit(X_train_count_vec, Y_train)

In [69]:
predicted_train = nb_model.predict(X_train_count_vec)
print('Train accuracy: ', np.mean(predicted_train == Y_train))
predicted_valid = nb_model.predict(X_validate_count_vec)
print('Validation accuracy: ', np.mean(predicted_valid == Y_validate))
predicted_test = nb_model.predict(X_test_count_vec)
print('Test Accuracy: ', np.mean(predicted_test == Y_test))

Train accuracy:  0.975
Validation accuracy:  0.266
Test Accuracy:  0.065


### Assignment

- Try out different permutations of feature engineering and see how it effects the results
- Try out different models and see how it effects the accuracy
- Try out different model hyper parameters and see how it effects the accuracy