In [46]:
#importing library
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm import tqdm
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

from sklearn.decomposition import PCA

In [47]:
#loading the dataset
rating_df=pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
keywords_df=pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')

In [48]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [49]:
keywords_df

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [50]:
keywords_df.columns=['movieId','keywords']

In [51]:
keywords_df

Unnamed: 0,movieId,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [52]:
word_dictionary={}
                    #literal_eval convert list inside the string to actual list
for word_list in tqdm(keywords_df['keywords']):
    for word in literal_eval(word_list):
        word_dictionary[word['id']]=word['name']

        
    
    

100%|██████████| 46419/46419 [00:02<00:00, 19620.25it/s]


In [53]:
len(word_dictionary)

19956

In [54]:
rating_df=rating_df.drop(['userId','timestamp'],axis=1)

In [55]:
rating_df

Unnamed: 0,movieId,rating
0,110,1.0
1,147,4.5
2,858,5.0
3,1221,5.0
4,1246,5.0
...,...,...
26024284,58559,5.0
26024285,60069,5.0
26024286,63082,4.5
26024287,64957,4.5


In [56]:
rating_df=rating_df.groupby(rating_df['movieId']).aggregate({'rating':'mean'})

In [57]:
rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.888157
2,3.236953
3,3.175550
4,2.875713
5,3.079565
...,...
176267,4.000000
176269,3.500000
176271,5.000000
176273,1.000000


In [58]:
train_df=keywords_df.merge(rating_df,on='movieId')

In [59]:
train_df

Unnamed: 0,movieId,keywords,rating
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",3.598930
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",3.760163
2,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",3.905544
3,710,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...",2.740334
4,1408,"[{'id': 911, 'name': 'exotic island'}, {'id': ...",3.710181
...,...,...,...
7663,98604,[],3.795053
7664,5589,"[{'id': 3205, 'name': 'fairy tale'}, {'id': 13...",3.126140
7665,45527,[],2.791667
7666,49280,[],3.390625


In [60]:
train_df=train_df.drop(train_df[train_df['keywords']=='[]'].index,axis=0).reset_index(drop=True)


In [61]:
train_df

Unnamed: 0,movieId,keywords,rating
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",3.598930
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",3.760163
2,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",3.905544
3,710,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...",2.740334
4,1408,"[{'id': 911, 'name': 'exotic island'}, {'id': ...",3.710181
...,...,...,...
5756,79927,"[{'id': 716, 'name': 'vatican'}, {'id': 717, '...",3.224490
5757,149946,"[{'id': 187056, 'name': 'woman director'}]",0.500000
5758,64197,"[{'id': 187056, 'name': 'woman director'}]",3.788051
5759,5589,"[{'id': 3205, 'name': 'fairy tale'}, {'id': 13...",3.126140


In [62]:
train_df['keywords']=train_df['keywords'].apply(lambda word_list:[word_dict['id'] for word_dict in literal_eval(word_list)])

In [63]:
train_df

Unnamed: 0,movieId,keywords,rating
0,862,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...",3.598930
1,8844,"[10090, 10941, 15101, 33467, 158086, 158091]",3.760163
2,949,"[642, 703, 974, 1523, 3713, 7281, 9727, 9812, ...",3.905544
3,710,"[701, 769, 1308, 2812, 3268, 3272, 3278, 3376,...",2.740334
4,1408,"[911, 1454, 1969, 3799, 5470, 12988]",3.710181
...,...,...,...
5756,79927,"[716, 717, 10093, 154788]",3.224490
5757,149946,[187056],0.500000
5758,64197,[187056],3.788051
5759,5589,"[3205, 13027, 155697, 196541]",3.126140


In [64]:
#Splitting the data
x=train_df['keywords']
y=train_df['rating']

In [65]:
x

0       [931, 4290, 5202, 6054, 9713, 9823, 165503, 17...
1            [10090, 10941, 15101, 33467, 158086, 158091]
2       [642, 703, 974, 1523, 3713, 7281, 9727, 9812, ...
3       [701, 769, 1308, 2812, 3268, 3272, 3278, 3376,...
4                    [911, 1454, 1969, 3799, 5470, 12988]
                              ...                        
5756                            [716, 717, 10093, 154788]
5757                                             [187056]
5758                                             [187056]
5759                        [3205, 13027, 155697, 196541]
5760                                [2679, 14531, 215397]
Name: keywords, Length: 5761, dtype: object

In [66]:
word_counts={}

for word in x:
    for word in word:
        if word in word_counts:
            word_counts[word]+=1
        else:
            word_counts[word]=1

In [67]:
word_counts

{931: 72,
 4290: 4,
 5202: 14,
 6054: 112,
 9713: 32,
 9823: 18,
 165503: 3,
 170722: 2,
 187065: 3,
 10090: 1,
 10941: 12,
 15101: 3,
 33467: 3,
 158086: 2,
 158091: 4,
 642: 55,
 703: 71,
 974: 18,
 1523: 33,
 3713: 27,
 7281: 8,
 9727: 20,
 9812: 4,
 9826: 248,
 9937: 121,
 10051: 19,
 10085: 22,
 10594: 49,
 10726: 26,
 15076: 1,
 18023: 3,
 34117: 26,
 156121: 9,
 159343: 3,
 159434: 2,
 167104: 3,
 192261: 1,
 207268: 23,
 208009: 2,
 214983: 2,
 701: 14,
 769: 18,
 1308: 26,
 2812: 6,
 3268: 5,
 3272: 10,
 3278: 9,
 3376: 5,
 3531: 3,
 3560: 2,
 3561: 1,
 3562: 2,
 3563: 4,
 3564: 2,
 193008: 3,
 911: 17,
 1454: 19,
 1969: 6,
 3799: 41,
 5470: 1,
 12988: 14,
 383: 7,
 726: 15,
 1228: 40,
 2635: 9,
 33625: 21,
 420: 3,
 818: 148,
 964: 7,
 2755: 9,
 7564: 2,
 10911: 11,
 11109: 6,
 15060: 5,
 156507: 2,
 156510: 1,
 156512: 1,
 612: 44,
 613: 11,
 616: 42,
 622: 6,
 922: 19,
 2700: 2,
 12670: 89,
 160488: 21,
 187056: 465,
 198129: 1,
 395: 26,
 416: 9,
 8438: 4,
 9935: 19,
 1039

In [68]:
word_counts_sorted={key:value for key,value in sorted(word_counts.items(),key=lambda item:item[1],reverse=True)}

In [69]:
word_counts_sorted

{187056: 465,
 10183: 313,
 9826: 248,
 818: 148,
 572: 147,
 14819: 146,
 9748: 142,
 9937: 121,
 2483: 120,
 4344: 119,
 6054: 112,
 3737: 108,
 293: 108,
 90: 105,
 5565: 100,
 378: 99,
 6149: 99,
 242: 99,
 9673: 98,
 179431: 94,
 4565: 91,
 570: 91,
 12670: 89,
 1956: 85,
 212: 85,
 236: 84,
 9663: 83,
 14964: 82,
 5600: 81,
 10508: 78,
 549: 77,
 494: 76,
 2038: 76,
 931: 72,
 703: 71,
 34094: 71,
 13027: 67,
 6075: 67,
 18035: 66,
 13130: 65,
 1299: 64,
 195402: 63,
 237: 63,
 596: 62,
 179430: 62,
 154802: 62,
 10685: 60,
 380: 57,
 1157: 57,
 470: 56,
 1605: 56,
 642: 55,
 1930: 55,
 9951: 55,
 6091: 54,
 3800: 54,
 255: 54,
 6038: 54,
 11221: 53,
 13005: 51,
 11612: 51,
 1415: 51,
 2343: 50,
 10594: 49,
 1566: 48,
 13006: 48,
 10714: 47,
 417: 47,
 233: 47,
 966: 47,
 970: 46,
 65: 45,
 612: 44,
 10084: 44,
 779: 44,
 2652: 44,
 12377: 44,
 128: 43,
 34079: 43,
 616: 42,
 13142: 42,
 4379: 42,
 6027: 42,
 14534: 42,
 736: 42,
 1155: 42,
 3799: 41,
 158718: 41,
 13008: 41,
 13

In [70]:
vocabulary=list(word_counts_sorted)[:100]
vocabulary

[187056,
 10183,
 9826,
 818,
 572,
 14819,
 9748,
 9937,
 2483,
 4344,
 6054,
 3737,
 293,
 90,
 5565,
 378,
 6149,
 242,
 9673,
 179431,
 4565,
 570,
 12670,
 1956,
 212,
 236,
 9663,
 14964,
 5600,
 10508,
 549,
 494,
 2038,
 931,
 703,
 34094,
 13027,
 6075,
 18035,
 13130,
 1299,
 195402,
 237,
 596,
 179430,
 154802,
 10685,
 380,
 1157,
 470,
 1605,
 642,
 1930,
 9951,
 6091,
 3800,
 255,
 6038,
 11221,
 13005,
 11612,
 1415,
 2343,
 10594,
 1566,
 13006,
 10714,
 417,
 233,
 966,
 970,
 65,
 612,
 10084,
 779,
 2652,
 12377,
 128,
 34079,
 616,
 13142,
 4379,
 6027,
 14534,
 736,
 1155,
 3799,
 158718,
 13008,
 1326,
 10292,
 254,
 1228,
 15162,
 6270,
 15160,
 220,
 30,
 2041,
 1664]

In [71]:
len(vocabulary)

100

In [72]:
for word_list in x:
    word_list[:]=[word for word in word_list if word in vocabulary]

In [73]:
x

0                         [931, 6054]
1                                  []
2       [642, 703, 9826, 9937, 10594]
3                                  []
4                              [3799]
                    ...              
5756                               []
5757                         [187056]
5758                         [187056]
5759                          [13027]
5760                               []
Name: keywords, Length: 5761, dtype: object

In [74]:
null_indices=set()


for i,words in enumerate(x):
    if not words:
        null_indices.add(i)

In [75]:
null_indices

{1,
 3,
 17,
 24,
 30,
 40,
 45,
 46,
 48,
 54,
 55,
 56,
 59,
 65,
 66,
 67,
 70,
 75,
 78,
 82,
 84,
 87,
 91,
 96,
 98,
 99,
 100,
 107,
 114,
 116,
 119,
 124,
 128,
 150,
 151,
 152,
 158,
 160,
 161,
 163,
 167,
 170,
 172,
 182,
 183,
 184,
 188,
 189,
 192,
 193,
 196,
 199,
 202,
 207,
 208,
 211,
 216,
 219,
 222,
 242,
 245,
 247,
 251,
 254,
 255,
 261,
 268,
 270,
 272,
 276,
 277,
 280,
 282,
 283,
 289,
 290,
 300,
 301,
 305,
 306,
 342,
 351,
 353,
 356,
 361,
 363,
 364,
 365,
 372,
 376,
 389,
 392,
 396,
 399,
 400,
 401,
 402,
 406,
 407,
 409,
 410,
 411,
 412,
 418,
 420,
 424,
 430,
 441,
 444,
 446,
 456,
 458,
 472,
 473,
 477,
 479,
 490,
 494,
 495,
 502,
 504,
 507,
 508,
 510,
 511,
 512,
 515,
 516,
 520,
 521,
 522,
 523,
 527,
 528,
 529,
 530,
 532,
 544,
 552,
 554,
 559,
 565,
 569,
 570,
 572,
 573,
 578,
 579,
 581,
 586,
 593,
 599,
 600,
 606,
 613,
 617,
 619,
 621,
 630,
 633,
 645,
 647,
 651,
 656,
 660,
 662,
 664,
 665,
 667,
 670,
 679,
 6

In [76]:
x=x.drop(null_indices,axis=0).reset_index(drop=True)
y=y.drop(null_indices,axis=0).reset_index(drop=True)

In [77]:
x

0                         [931, 6054]
1       [642, 703, 9826, 9937, 10594]
2                              [3799]
3                              [1228]
4                               [818]
                    ...              
3680       [293, 2483, 10183, 187056]
3681                   [10292, 12377]
3682                         [187056]
3683                         [187056]
3684                          [13027]
Name: keywords, Length: 3685, dtype: object

In [78]:
y

0       3.598930
1       3.905544
2       3.710181
3       3.515170
4       2.718412
          ...   
3680    2.857143
3681    2.875000
3682    0.500000
3683    3.788051
3684    3.126140
Name: rating, Length: 3685, dtype: float64

In [79]:
word_column_names=[]

for word_list in x:
    for word in word_list:
        if word not in word_column_names:
            word_column_names.append(word)


In [80]:
word_column_names=list(map(lambda x: word_dictionary[x],word_column_names))

In [81]:
mlb=MultiLabelBinarizer()
x=pd.DataFrame(mlb.fit_transform(x),columns=word_column_names)

In [82]:
x

Unnamed: 0,jealousy,friendship,robbery,detective,murder,suspense,money,ship,1970s,based on novel,...,teenager,gore,female nudity,france,berlin,zombie,high school,monster,silent film,duringcreditsstinger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Traininig Model

In [83]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=23)

In [84]:
x.shape

(3685, 100)

In [90]:
inputs=tf.keras.Input(shape=(100,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(1,activation='linear')(x)

model=tf.keras.Model(inputs=inputs,outputs=outputs)

model.compile(optimizer='adam',loss='mse')
batch_size=32
epochs=10

history=model.fit(x_train,y_train,validation_split=0.2,batch_size=batch_size,epochs=epochs,
                 callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/10


I0000 00:00:1724826149.870467     985 service.cc:145] XLA service 0x78abe4005d40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1724826149.870526     985 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m36/65[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 1ms/step - loss: 8.7288  

I0000 00:00:1724826150.711978     985 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 7.2589 - val_loss: 1.0286 - learning_rate: 0.0010
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8445 - val_loss: 0.5788 - learning_rate: 0.0010
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5310 - val_loss: 0.4653 - learning_rate: 0.0010
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4185 - val_loss: 0.4128 - learning_rate: 0.0010
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3809 - val_loss: 0.3904 - learning_rate: 0.0010
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3469 - val_loss: 0.3776 - learning_rate: 0.0010
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3309 - val_loss: 0.3699 - learning_rate: 0.0010
Epoch 8/10
[1m6

In [98]:
fig=px.line(history.history,y=['loss','val_loss'],
         labels= {'x':'Epochs','y':"Loss"},
           title='Loss Over Time')

fig.show()

In [96]:
model.evaluate(x_test,y_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3299 


0.3534304201602936

In [105]:
y_pred=model.predict(x_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [103]:
y_test=y_test.to_numpy()

In [104]:
y_test

array([3.01587302, 3.56557377, 2.75802139, ..., 3.5       , 2.42718447,
       1.3       ])

In [106]:
y_pred=np.squeeze(y_pred)

In [107]:
y_pred

array([3.0829477, 3.1171885, 3.3817916, ..., 3.2296586, 3.0719543,
       3.2082968], dtype=float32)