In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pandas as pd
import os

# Get the current working directory
cwd = os.getcwd()

print("Current working directory:", cwd)

Current working directory: /content


In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/MyDrive/dataset/cleaned_data.csv')
one_hot = pd.read_csv('/content/drive/MyDrive/dataset/one_hot_encoding.csv')

In [4]:
df.shape, one_hot.shape

((544267, 15), (544267, 20))

In [5]:
#Preparing the label for Training

rating = [0, 4, 6, 8, 10]

new_label = ['bad', 'average', 'good', 'favourite']

new_label = {i: label for i, label in zip(rating, new_label)}

df["rating_label"] =  df['vote_average'].map(new_label)

In [6]:
df

Unnamed: 0,index,id,title,status,release_date,revenue,runtime,adult,production_countries,production_companies,budget,genres,overview,vote_average,profit,rating_label
0,336987,412771,Alice Cooper: Live from the Astroturf,Released,2019-04-05,0,60,False,Unknown,"['good records', 'twelve 37 filmworks']",0,"['documentary', 'music']",An independent record store owner and life-lon...,10.000,0,
1,186493,650282,Erotic In-Laws,Released,2019-02-12,0,70,False,South Korea,['(주)가온콘텐츠'],0,"['drama', 'romance']",A man who loves pictures of Japanese women fal...,6.000,0,good
2,36930,166607,Behold a Pale Horse,Released,1964-08-14,0,118,False,United States of America,"['highwood-brentwood production', 'columbia pi...",0,"['drama', 'war']","Manuel Artiguez, a famous bandit during the Sp...",5.859,0,
3,210905,236450,Malibu Beach Party,Released,1940-09-14,0,8,False,United States of America,"['warner bros. pictures', 'leon schlesinger pr...",0,['animation'],Jack Bunny (a spoof of Jack Benny) invites Hol...,5.500,0,
4,220051,124970,Oma & Bella,Released,2012-02-16,0,76,False,"Germany, United States of America",['oscilloscope'],0,['documentary'],Oma & Bella is an intimate glimpse into the wo...,9.000,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544262,666054,1273374,Princess Mary Wedded to Viscount Lascelles at ...,Released,1922-03-02,0,8,False,United Kingdom,"['topical budget company', 'bfi']",0,['documentary'],Lavish coverage of an even more lavish Royal w...,0.000,0,bad
544263,32461,812025,The Silent Twins,Released,2022-09-16,0,113,False,"France, Poland, United Kingdom, United States ...","['kindred spirit', 'madants', 'extreme emotion...",0,['drama'],Feeling isolated from that unwelcoming communi...,7.000,0,
544264,170396,119912,Twelfth Night,Released,1980-01-06,0,128,False,United Kingdom,"['bbc', 'time-life television productions']",0,"['comedy', 'romance', 'tv movie']","Viola and Sebastian are lookalike twins, separ...",6.700,0,
544265,311791,1062678,Hotel Metamorphosis,Released,2013-06-20,1000,27,False,Germany,"['fachhochschule dortmund', 'kunsthochschule f...",1000,['comedy'],Twenty-year-old Dora works as a maid in a hote...,7.000,0,


In [7]:
df.isna().sum()

index                        0
id                           0
title                        0
status                       0
release_date             24386
revenue                      0
runtime                      0
adult                        0
production_countries         0
production_companies         0
budget                       0
genres                       0
overview                     0
vote_average                 0
profit                       0
rating_label            222218
dtype: int64

In [8]:
df.shape

(544267, 16)

In [9]:
labeled = df.dropna(subset=['rating_label'])

In [10]:
labeled.isna().sum()

index                       0
id                          0
title                       0
status                      0
release_date            22988
revenue                     0
runtime                     0
adult                       0
production_countries        0
production_companies        0
budget                      0
genres                      0
overview                    0
vote_average                0
profit                      0
rating_label                0
dtype: int64

In [11]:
merged = pd.merge(labeled, one_hot, on='title')

In [12]:
merged

Unnamed: 0,index,id,title,status,release_date,revenue,runtime,adult,production_countries,production_companies,...,history,horror,music,mystery,romance,science fiction,thriller,tv movie,war,western
0,186493,650282,Erotic In-Laws,Released,2019-02-12,0,70,False,South Korea,['(주)가온콘텐츠'],...,0,0,0,0,1,0,0,0,0,0
1,847637,312614,Paradise Lost: The Life and Times of John Milton,Released,2007-04-24,0,80,False,Unknown,['unknown'],...,0,0,0,0,0,0,0,0,0,0
2,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],...,0,0,0,0,1,0,0,0,0,0
3,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],...,0,0,0,0,0,0,0,0,0,0
4,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662600,228276,199348,Adria Blues,Released,2013-01-01,0,90,False,"Croatia, Slovenia","['gustav film', 'senca studio', 'filmostovje']",...,0,0,0,0,0,0,0,0,0,0
662601,859001,337202,Grampa's Cabin,Released,2007-11-03,0,12,False,United States of America,['red balloon entertainment'],...,0,0,0,0,0,0,0,0,0,0
662602,484157,963798,Miyah: The Life of a Javanese Woman,Released,1999-01-01,0,30,False,Unknown,['unknown'],...,0,0,0,0,0,0,0,0,0,0
662603,666054,1273374,Princess Mary Wedded to Viscount Lascelles at ...,Released,1922-03-02,0,8,False,United Kingdom,"['topical budget company', 'bfi']",...,0,0,0,0,0,0,0,0,0,0


In [13]:
pd.set_option('display.max_columns', None)  # Set the maximum number of columns to display to None
merged.isna().sum()

index                       0
id                          0
title                       0
status                      0
release_date            52069
revenue                     0
runtime                     0
adult                       0
production_countries        0
production_companies        0
budget                      0
genres                      0
overview                    0
vote_average                0
profit                      0
rating_label                0
action                      0
adventure                   0
animation                   0
comedy                      0
crime                       0
documentary                 0
drama                       0
family                      0
fantasy                     0
history                     0
horror                      0
music                       0
mystery                     0
romance                     0
science fiction             0
thriller                    0
tv movie                    0
war       

In [14]:
merged['rating_label'].isna().sum()

0

In [15]:
print(merged.columns.get_loc('action'))

16


In [24]:
import tensorflow as tf

# Define the tokenizer
overview_data = merged['overview'].values
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)  # Initialize Tokenizer with num_words parameter

tokenizer.fit_on_texts(overview_data)  # Fit tokenizer on text data

# Convert text data to sequences
overview_sequences = tokenizer.texts_to_sequences(overview_data)

# Pad sequences to ensure uniform length
max_len = 100
overview_sequences = tf.keras.preprocessing.sequence.pad_sequences(overview_sequences, maxlen=max_len)


In [17]:
mapping_label = {'bad': 1, 'average': 2, 'good': 3, 'favourite': 4}

merged['rating_label'] = merged['rating_label'].replace(mapping_label)



In [18]:
merged

Unnamed: 0,index,id,title,status,release_date,revenue,runtime,adult,production_countries,production_companies,budget,genres,overview,vote_average,profit,rating_label,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,history,horror,music,mystery,romance,science fiction,thriller,tv movie,war,western
0,186493,650282,Erotic In-Laws,Released,2019-02-12,0,70,False,South Korea,['(주)가온콘텐츠'],0,"['drama', 'romance']",A man who loves pictures of Japanese women fal...,6.0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,847637,312614,Paradise Lost: The Life and Times of John Milton,Released,2007-04-24,0,80,False,Unknown,['unknown'],0,['documentary'],The life and work of 17th-century English poet...,0.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],0,"['comedy', 'drama']",Those Were the Days is a Hong Kong Drama starr...,6.0,0,3,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],0,"['comedy', 'drama']",Those Were the Days is a Hong Kong Drama starr...,6.0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,172226,461760,Those Were the Days,Released,1996-06-07,0,101,False,Hong Kong,['united filmmakers organisation (ufo)'],0,"['comedy', 'drama']",Those Were the Days is a Hong Kong Drama starr...,6.0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662600,228276,199348,Adria Blues,Released,2013-01-01,0,90,False,"Croatia, Slovenia","['gustav film', 'senca studio', 'filmostovje']",0,['drama'],Aging Bosnian rocker Toni Riff hasn’t written ...,6.0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
662601,859001,337202,Grampa's Cabin,Released,2007-11-03,0,12,False,United States of America,['red balloon entertainment'],0,['drama'],Short drama,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
662602,484157,963798,Miyah: The Life of a Javanese Woman,Released,1999-01-01,0,30,False,Unknown,['unknown'],0,['documentary'],This is an intimate portrait of a Javanese wom...,0.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
662603,666054,1273374,Princess Mary Wedded to Viscount Lascelles at ...,Released,1922-03-02,0,8,False,United Kingdom,"['topical budget company', 'bfi']",0,['documentary'],Lavish coverage of an even more lavish Royal w...,0.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
y = merged['rating_label']

# TRAINING WITH OVERVIEWS

In [28]:
import tensorflow as tf

X = overview_sequences
y = merged['rating_label']
# Define the model architecture
embedding_dim = 100

train_X, test_X,train_y, test_y = train_test_split(X,y,train_size =0.8, random_state=42 )

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use sparse categorical cross-entropy for integer labels
              metrics=['accuracy'])

# Train the model
history = model.fit(train_X, train_y, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the validation data
val_loss, val_acc = model.evaluate(test_X,  test_y)
print('Validation accuracy:', val_acc)

# Optionally, test the model on unseen data
# test_loss, test_acc = model.evaluate(test_sequences, test_labels)
# print('Test accuracy:', test_acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation accuracy: 0.0


# TRAINING WITH GENRES

In [32]:
import tensorflow as tf

X = merged.iloc[:, 16:]
y = merged['rating_label']

# Adjusting labels to be zero-indexed (0-3) since sparse_categorical_crossentropy expects zero-indexed labels
y = y - 1

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state=42)

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(19,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(4, activation='softmax')  # 4 classes
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',  # Use sparse categorical cross-entropy for integer labels
              metrics=['accuracy'])

# Train the model
history = model.fit(train_X, train_y, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test data
val_loss, val_acc = model.evaluate(test_X, test_y)
print('Validation accuracy:', val_acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation accuracy: 0.8761026263237
