# Movie Genre Classification

In [1]:
import pandas as pd

In [2]:
# Load the training set. Fields are separated by " ::: " (a multi-character
# separator, hence the python parsing engine) and the column names are supplied
# here rather than read from the file.
trainDf = pd.read_csv(
    "./data/train_data.txt",
    sep=" ::: ",
    engine="python",
    names=["id", "title", "genre", "description"],
)
trainDf.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [3]:
# (rows, columns) of the training frame.
trainDf.shape

(54214, 4)

In [4]:
# Per-column dtype and non-null count for the training frame.
trainDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54214 non-null  int64 
 1   title        54214 non-null  object
 2   genre        54214 non-null  object
 3   description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [5]:
# Number of missing values in each column of the training frame.
trainDf.isna().sum(axis=0)

id             0
title          0
genre          0
description    0
dtype: int64

In [6]:
# Show fully duplicated rows, if any; the boolean mask is used directly as the row filter.
trainDf[trainDf.duplicated()]

Unnamed: 0,id,title,genre,description


## Preprocessing the training data

In [7]:
# Peek at the first five rows before preprocessing.
trainDf.head(n=5)

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


### Let's convert the categorical genre labels into numerical values

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Encode the genre strings as integer class ids. The fitted `encoder` is kept so
# the same string -> id mapping can be applied to other label columns later.
#
# This cell overwrites trainDf["genre"] in place. Without the guard, running it a
# second time would refit the encoder on the integer codes and discard the
# original string classes. Skipping the work when the column is already
# integer-typed keeps the cell safe to re-run.
if not pd.api.types.is_integer_dtype(trainDf["genre"]):
    encoder = LabelEncoder()
    trainDf["genre"] = encoder.fit_transform(trainDf["genre"])

trainDf.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),8,Listening in to a conversation between his doc...
1,2,Cupid (1997),24,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",1,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),8,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),8,The film's title refers not only to the un-rec...


**Note:** Rather than vectorizing the title and description separately, let's merge them into a single text column and then vectorize that.

In [10]:
# Build one text field per movie: the title, a single space, then the description.
trainDf["movie"] = trainDf["title"].str.cat(trainDf["description"], sep=" ")

In [11]:
# Confirm the merged "movie" column was added.
trainDf.head(n=5)

Unnamed: 0,id,title,genre,description,movie
0,1,Oscar et la dame rose (2009),8,Listening in to a conversation between his doc...,Oscar et la dame rose (2009) Listening in to a...
1,2,Cupid (1997),24,A brother and sister with a past incestuous re...,Cupid (1997) A brother and sister with a past ...
2,3,"Young, Wild and Wonderful (1980)",1,As the bus empties the students for their fiel...,"Young, Wild and Wonderful (1980) As the bus em..."
3,4,The Secret Sin (1915),8,To help their unemployed father make ends meet...,The Secret Sin (1915) To help their unemployed...
4,5,The Unrecovered (2007),8,The film's title refers not only to the un-rec...,The Unrecovered (2007) The film's title refers...


In [12]:
# Drop the columns that are no longer needed now that "movie" holds the text.
# - `columns=` already names the axis, so the redundant `axis=1` is removed.
# - The frame is rebound instead of mutated in place, and errors="ignore" makes
#   the drop a no-op for columns that are already gone, so re-running this cell
#   no longer raises KeyError.
trainDf = trainDf.drop(columns=["id", "title", "description"], errors="ignore")
trainDf.head()

Unnamed: 0,genre,movie
0,8,Oscar et la dame rose (2009) Listening in to a...
1,24,Cupid (1997) A brother and sister with a past ...
2,1,"Young, Wild and Wonderful (1980) As the bus em..."
3,8,The Secret Sin (1915) To help their unemployed...
4,8,The Unrecovered (2007) The film's title refers...


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: token counts with the built-in English stop-word list
# removed, stored as int16.
vectorizer = CountVectorizer(
    stop_words="english",
    dtype="int16",
)

# Learn the vocabulary from the training text and build its count matrix in one pass.
X_train = vectorizer.fit_transform(trainDf["movie"])

In [14]:
# (number of training rows, number of vocabulary terms).
X_train.shape

(54214, 135433)

In [15]:
# Check that the count matrix uses the dtype requested from the vectorizer.
X_train.dtype

dtype('int16')

In [16]:
# Training target: the integer-encoded genre column.
y_train = trainDf["genre"]
# Its length should match the number of rows in X_train.
y_train.shape

(54214,)

In [17]:
# Container type of the feature matrix.
type(X_train)

scipy.sparse._csr.csr_matrix

In [18]:
# Container type of the target.
type(y_train)

pandas.core.series.Series

## Model building

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression over the sparse count features.
# "sag" is a stochastic solver that shuffles the data, so the seed is fixed to
# make the fitted model (and the accuracy reported later) reproducible across runs.
# Other available solvers: 'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'saga'.
# NOTE(review): sag converges fastest on features of similar scale; if a
# ConvergenceWarning appears here, consider raising max_iter — TODO confirm.
model = LogisticRegression(solver="sag", random_state=42)

model.fit(X_train, y_train)



### Test data

In [20]:
# Load the unlabeled test set. It uses the same " ::: " separator as the training
# file but has no genre column.
testDf = pd.read_csv(
    "./data/test_data.txt",
    sep=" ::: ",
    engine="python",
    names=["id", "title", "description"],
)
testDf.head()

Unnamed: 0,id,title,description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [21]:
# (rows, columns) of the test frame.
testDf.shape

(54200, 3)

In [22]:
# Number of missing values in each column of the test frame.
testDf.isna().sum(axis=0)

id             0
title          0
description    0
dtype: int64

In [23]:
# Per-column dtype and non-null count for the test frame.
testDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54200 non-null  int64 
 1   title        54200 non-null  object
 2   description  54200 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [24]:
# Show fully duplicated rows in the test frame, if any.
testDf[testDf.duplicated()]

Unnamed: 0,id,title,description


## Preprocessing the test data

In [25]:
# Peek at the first five test rows before preprocessing.
testDf.head(n=5)

Unnamed: 0,id,title,description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [26]:
# Apply the same preprocessing as for the training frame: merge title and
# description into one "movie" text column, then drop the source columns.
# - The merge is guarded so that re-running the cell after the source columns
#   have been dropped no longer raises KeyError.
# - `columns=` already names the axis, so the redundant `axis=1` is removed.
# - The frame is rebound instead of mutated in place, and errors="ignore" makes
#   the drop a no-op for columns that are already gone.
if "movie" not in testDf.columns:
    testDf["movie"] = testDf["title"] + " " + testDf["description"]
testDf = testDf.drop(columns=["id", "title", "description"], errors="ignore")
testDf.head()

Unnamed: 0,movie
0,Edgar's Lunch (1998) L.R. Brane loves his life...
1,"La guerra de papá (1977) Spain, March 1964: Qu..."
2,Off the Beaten Track (2010) One year in the li...
3,"Meu Amigo Hindu (2015) His father has died, he..."
4,Er nu zhai (1955) Before he was known internat...


### Vectorize the test data

In [27]:
# Use transform (not fit_transform) so the test text is encoded with the vocabulary
# already learned from the training data.
X_test = vectorizer.transform(testDf["movie"])

In [28]:
# Predicted integer-encoded genre for each test row.
predictions = model.predict(X_test)

## Test solution data

In [29]:
# Load the test-set solutions, which have the same layout as the training file
# (including the genre column).
testSolDf = pd.read_csv(
    "./data/test_data_solution.txt",
    sep=" ::: ",
    engine="python",
    names=["id", "title", "genre", "description"],
)
testSolDf.head()

Unnamed: 0,id,title,genre,description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


In [30]:
# Map the solution genres to the integer ids learned by the already-fitted
# `encoder`, so they are comparable with the model's predictions.
# This cell overwrites the column in place. The guard skips the transform when
# the column is already integer-typed, so re-running the cell does not try to
# encode values that were already encoded.
if not pd.api.types.is_integer_dtype(testSolDf["genre"]):
    testSolDf["genre"] = encoder.transform(testSolDf["genre"])
testSolDf.head()

Unnamed: 0,id,title,genre,description
0,1,Edgar's Lunch (1998),24,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),5,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),7,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),8,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),8,Before he was known internationally as a marti...


In [31]:
# Ground-truth integer-encoded genres for the test set.
y_test = testSolDf["genre"]

# Display the labels as a sanity check.
y_test

0        24
1         5
2         7
3         8
4         8
         ..
54195    13
54196    26
54197     1
54198     8
54199     8
Name: genre, Length: 54200, dtype: int32

In [32]:
# Fraction of test rows whose predicted genre id equals the true id.
# NOTE(review): assumes test_data.txt and test_data_solution.txt list the movies
# in the same row order — TODO confirm.
accuracy_score(y_true=y_test, y_pred=predictions)

0.597380073800738