# Classification with NLP
**Task:** Predicting *genres* based on *overview*


In [42]:
# ------ Imports & data ------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer  # text -> numeric label
from sklearn.model_selection import train_test_split

df: pd.DataFrame = pd.read_csv('data/raw/TMDB_movie_dataset_v11.csv')

In [None]:
# ------ Preprocess Data ------
columns: list = ['title', 'overview', 'genres']
df: pd.DataFrame = df[columns].dropna().reset_index()
df.head()


Unnamed: 0,index,title,overview,genres
0,0,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure"
1,1,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction"
2,2,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller"
3,3,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction"
4,4,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure"


In [None]:
# Preprocess genres
df['genres_list'] = df['genres'].str.split(',')  # comma-separated -> python list
df['genres_list'].head()

0              [Action,  Science Fiction,  Adventure]
1               [Adventure,  Drama,  Science Fiction]
2                 [Drama,  Action,  Crime,  Thriller]
3    [Action,  Adventure,  Fantasy,  Science Fiction]
4              [Science Fiction,  Action,  Adventure]
Name: genres_list, dtype: object

In [41]:
# ------ Creating ground truth ------
mlb: MultiLabelBinarizer = MultiLabelBinarizer()

# MLB creates a matrix, where each row has as many columns as there are genres
# filled with zeros, except where the movie is labeled with the genre
# therefore y is ground truth
y = mlb.fit_transform(df['genres_list'])

In [None]:
# ------ Vectorize 'overview' ------
# Transform the 'overview' to a vector representation
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['overview'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# ------ Model Selection ------
# We have to extend the LogisticRegression to be able to handle multioutput
# This is done via a Wrapper-Class
model = MultiOutputClassifier(LogisticRegression(max_iter=200))
model.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegre...(max_iter=200)
,n_jobs,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [None]:
# ------ Test / Prediction & Evaluation------
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(type(y_pred))
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

0.26332049462801543
<class 'numpy.ndarray'>
                  precision    recall  f1-score   support

          Action       0.25      0.01      0.01      2957
       Adventure       0.38      0.02      0.04      2705
       Animation       0.20      0.00      0.00      1664
          Comedy       0.47      0.01      0.02      7658
           Crime       0.32      0.04      0.07      3481
     Documentary       0.33      0.00      0.00      1819
           Drama       0.53      0.01      0.01     12001
          Family       0.42      0.03      0.05      3226
         Fantasy       0.42      0.03      0.05      2915
         History       0.41      0.02      0.05      2217
          Horror       0.20      0.01      0.01      3326
           Music       0.33      0.05      0.09      2872
         Mystery       0.30      0.01      0.03      2642
         Romance       0.37      0.04      0.07      6259
 Science Fiction       0.40      0.09      0.15      2506
        TV Movie       0.35

ValueError: multilabel-indicator is not supported