# Movie genre classification from description

In [40]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

In [11]:
text_file = open("train_data.txt", "r")
for i, line in enumerate(text_file):
    print(line)
    if i == 4:
        break

1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.

2 ::: Cupid (1997) ::: thriller ::: A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.

3 ::: Young, Wild and Wonderful (1980) ::: adult ::: As the bus empties the students for their field trip to the Museum of Natural History, little does the tour guide suspect that the

In [12]:
df = pd.read_csv('train_data.txt', sep=':::', header=None, engine='python')
df = df.drop(0, axis=1)
df.columns = ['name', 'class', 'description']
df.head()

Unnamed: 0,name,class,description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


# Class balance - Downsample

In [13]:
# Cap the number of rows per class to 1000
df = df.groupby('class').head(2000).reset_index(drop=True)

df['class'].value_counts()

class
 drama           2000
 documentary     2000
 comedy          2000
 horror          2000
 short           2000
 thriller        1591
 action          1315
 western         1032
 reality-tv       884
 family           784
 adventure        775
 music            731
 romance          672
 sci-fi           647
 adult            590
 crime            505
 animation        498
 sport            432
 talk-show        391
 fantasy          323
 mystery          319
 musical          277
 biography        265
 history          243
 game-show        194
 news             181
 war              132
Name: count, dtype: int64

In [14]:
# Load the pretrained word2vec model
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/tamara/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
# Tokenize the description data
df['tokens'] = df['description'].apply(word_tokenize)

# Convert the tokens to word embeddings using the pretrained model
def get_word_embeddings(tokens):
    embeddings = []
    for token in tokens:
        if token in word_vectors.key_to_index:
            embeddings.append(word_vectors[token])
    return np.mean(embeddings, axis=0)

df['embeddings'] = df['tokens'].apply(get_word_embeddings)

df.head()

Unnamed: 0,name,class,description,tokens,embeddings
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,"[Listening, in, to, a, conversation, between, ...","[0.04766588, 0.01824178, 0.0030127345, 0.06860..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,"[A, brother, and, sister, with, a, past, inces...","[-0.028375696, 0.06363706, 0.014784072, 0.0590..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,"[As, the, bus, empties, the, students, for, th...","[0.04315985, 0.030337507, 0.009937395, 0.07858..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,"[To, help, their, unemployed, father, make, en...","[0.028479205, 0.026918862, -0.0075246845, 0.02..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,"[The, film, 's, title, refers, not, only, to, ...","[0.061470397, 0.024994623, 0.03150613, 0.07394..."


In [21]:
X = df['embeddings']
y = df['class'] # Target or the labels we want to predict (i.e., the 13 different complaints of products)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state = 20)

# Train LinearSVC

In [22]:
# Train LinearSVC model from sklearn and evaluate it with cross-val_score
svc = LinearSVC(random_state=20, tol=1e-5)
scores = cross_val_score(svc, list(X_train), y_train, cv=5)
print(scores)

[0.47133059 0.47599451 0.47462277 0.47709191 0.48572997]


In [27]:
# Test the model on the test data
svc.fit(list(X_train), y_train)
#test_score = svc.score(list(X_test), y_test)
#test_score

# Use metrics.classification_report to evaluate the model
y_pred = svc.predict(list(X_test))
print(metrics.classification_report(y_test, y_pred, target_names=df['class'].unique(), zero_division=1))

# print scores
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted', zero_division=1))
print("Recall:", metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1-Score:", metrics.f1_score(y_test, y_pred, average='weighted', zero_division=1))


               precision    recall  f1-score   support

       drama        0.45      0.49      0.47       260
    thriller        0.53      0.61      0.57       106
       adult        0.46      0.17      0.25       167
 documentary        0.32      0.15      0.21        85
      comedy        0.00      0.00      0.00        58
       crime        0.43      0.54      0.48       421
  reality-tv        0.32      0.06      0.10       113
      horror        0.49      0.68      0.57       417
       sport        0.34      0.39      0.36       393
   animation        0.47      0.29      0.35       140
      action        0.40      0.07      0.12        59
     fantasy        0.78      0.68      0.73        41
       short        0.23      0.06      0.10        49
      sci-fi        0.55      0.78      0.65       406
       music        0.61      0.83      0.70       163
   adventure        0.50      0.08      0.14        48
   talk-show        1.00      0.00      0.00        50
     west

### Results
- With Down and Up-sampling: 
    - macro avg: ca. 43% on accuracy, precision, recall and f1-score
    - weighted avg: ca. 43% on accuracy, precision, recall and f1-score
    - -> Mid score on both big and small classes
- With Downsampling: 
    - macro avg: 44%, 38%, 38% on precision, recall and f1-score
    - weighted avg: 46%, 48%, 45% on precision, recall and f1-scoore
    - -> In between the other two experiments
- Without both: 
    - macro avg: 49%, 26%, 29% on precision, recall and f1-score
    - weighted avg: 55%, 57%, 52% on precision, recall and f1-score
    - -> High score on big classes, low score on small classes

# Train RandomForestClassifier

In [28]:
# Use random forest classifier
rfc = RandomForestClassifier(random_state=20)

# Test the model on the test data
rfc.fit(list(X_train), y_train)
test_score = rfc.score(list(X_test), y_test)
test_score

0.37568575817423744

In [30]:
# Use metrics.classification_report to evaluate the model
y_pred = rfc.predict(list(X_test))
print(metrics.classification_report(y_test, y_pred, target_names=df['class'].unique(), zero_division=1))

               precision    recall  f1-score   support

       drama        0.42      0.44      0.43       260
    thriller        0.51      0.17      0.26       106
       adult        0.50      0.05      0.10       167
 documentary        0.22      0.02      0.04        85
      comedy        0.00      0.00      0.00        58
       crime        0.30      0.45      0.36       421
  reality-tv        1.00      0.03      0.05       113
      horror        0.35      0.73      0.47       417
       sport        0.26      0.42      0.32       393
   animation        0.24      0.03      0.05       140
      action        1.00      0.00      0.00        59
     fantasy        0.80      0.29      0.43        41
       short        1.00      0.00      0.00        49
      sci-fi        0.42      0.67      0.52       406
       music        0.60      0.78      0.68       163
   adventure        1.00      0.00      0.00        48
   talk-show        1.00      0.00      0.00        50
     west