In [131]:
#importing library
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.decomposition import pca

In [83]:
#loading the dataset
#rating dataframe
rating_df=pd.read_csv('../input/the-movies-dataset/ratings.csv')
keyword_df=pd.read_csv('../input/the-movies-dataset/keywords.csv')

In [84]:
#loading keyword dataset
keyword_df

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [85]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [86]:
#renaming column name in keyword dataframe
keyword_df.columns=['movieId','keywords']

# Preprocessing

In [87]:
#literal_eval converts list in the string and convert it into a list
word_dictionary={}
#tqdm creating the process bar
for word_list in tqdm(keyword_df['keywords']):
    for word in literal_eval(word_list):
        word_dictionary[word['id']]=word['name']

100%|██████████| 46419/46419 [00:02<00:00, 20937.19it/s]


In [88]:
#check the length of the dictionary word_dictionary
len(word_dictionary)

19956

In [89]:
#dropping the unnecessary column in rating_df dataset
rating_df=rating_df.drop(['userId','timestamp'],axis=1)

In [90]:
word_dictionary

{931: 'jealousy',
 4290: 'toy',
 5202: 'boy',
 6054: 'friendship',
 9713: 'friends',
 9823: 'rivalry',
 165503: 'boy next door',
 170722: 'new toy',
 187065: 'toy comes to life',
 10090: 'board game',
 10941: 'disappearance',
 15101: "based on children's book",
 33467: 'new home',
 158086: 'recluse',
 158091: 'giant insect',
 1495: 'fishing',
 12392: 'best friend',
 179431: 'duringcreditsstinger',
 208510: 'old men',
 818: 'based on novel',
 10131: 'interracial relationship',
 14768: 'single mother',
 15160: 'divorce',
 33455: 'chick flick',
 1009: 'baby',
 1599: 'midlife crisis',
 2246: 'confidence',
 4995: 'aging',
 5600: 'daughter',
 10707: 'mother daughter relationship',
 13149: 'pregnancy',
 33358: 'contraception',
 170521: 'gynecologist',
 642: 'robbery',
 703: 'detective',
 974: 'bank',
 1523: 'obsession',
 3713: 'chase',
 7281: 'shooting',
 9727: 'thief',
 9812: 'honor',
 9826: 'murder',
 9937: 'suspense',
 10051: 'heist',
 10085: 'betrayal',
 10594: 'money',
 10726: 'gang',
 1

In [91]:
rating_df

Unnamed: 0,movieId,rating
0,110,1.0
1,147,4.5
2,858,5.0
3,1221,5.0
4,1246,5.0
...,...,...
26024284,58559,5.0
26024285,60069,5.0
26024286,63082,4.5
26024287,64957,4.5


In [92]:

for group in rating_df.groupby(rating_df['movieId'][0:5]):
    print(group)

(110.0,    movieId  rating
0      110     1.0)
(147.0,    movieId  rating
1      147     4.5)
(858.0,    movieId  rating
2      858     5.0)
(1221.0,    movieId  rating
3     1221     5.0)
(1246.0,    movieId  rating
4     1246     5.0)


In [93]:
# grouping the dataframe in movideid column and then aggregate it with movie mean
rating_df=rating_df.groupby(rating_df['movieId']).aggregate({'rating':'mean'})
#loading the final dataset
rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.888157
2,3.236953
3,3.175550
4,2.875713
5,3.079565
...,...
176267,4.000000
176269,3.500000
176271,5.000000
176273,1.000000


In [94]:
rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.888157
2,3.236953
3,3.175550
4,2.875713
5,3.079565
...,...
176267,4.000000
176269,3.500000
176271,5.000000
176273,1.000000


In [95]:
keyword_df

Unnamed: 0,movieId,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [96]:
#merging keyword_df and rating_df on movide id column
train_df=keyword_df.merge(rating_df,on='movieId')

In [97]:
train_df

Unnamed: 0,movieId,keywords,rating
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",3.598930
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",3.760163
2,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",3.905544
3,710,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...",2.740334
4,1408,"[{'id': 911, 'name': 'exotic island'}, {'id': ...",3.710181
...,...,...,...
7663,98604,[],3.795053
7664,5589,"[{'id': 3205, 'name': 'fairy tale'}, {'id': 13...",3.126140
7665,45527,[],2.791667
7666,49280,[],3.390625


In [98]:
#dropping those rows which has empty list in keywords column
train_df=train_df.drop(train_df[train_df['keywords']=='[]'].index,axis=0).reset_index(drop=True)

In [99]:
#extracting id from each row in keywords column
train_df['keywords']=train_df['keywords'].apply(lambda word_list: [word_dict['id']for word_dict in literal_eval(word_list)])

In [100]:
train_df

Unnamed: 0,movieId,keywords,rating
0,862,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...",3.598930
1,8844,"[10090, 10941, 15101, 33467, 158086, 158091]",3.760163
2,949,"[642, 703, 974, 1523, 3713, 7281, 9727, 9812, ...",3.905544
3,710,"[701, 769, 1308, 2812, 3268, 3272, 3278, 3376,...",2.740334
4,1408,"[911, 1454, 1969, 3799, 5470, 12988]",3.710181
...,...,...,...
5756,79927,"[716, 717, 10093, 154788]",3.224490
5757,149946,[187056],0.500000
5758,64197,[187056],3.788051
5759,5589,"[3205, 13027, 155697, 196541]",3.126140


In [101]:
#Splitting the dataset
#train_df.loc[all rows,column name]
y=train_df.loc[:,'rating']
x_raw=train_df.loc[:,'keywords']

In [102]:
x_raw

0       [931, 4290, 5202, 6054, 9713, 9823, 165503, 17...
1            [10090, 10941, 15101, 33467, 158086, 158091]
2       [642, 703, 974, 1523, 3713, 7281, 9727, 9812, ...
3       [701, 769, 1308, 2812, 3268, 3272, 3278, 3376,...
4                    [911, 1454, 1969, 3799, 5470, 12988]
                              ...                        
5756                            [716, 717, 10093, 154788]
5757                                             [187056]
5758                                             [187056]
5759                        [3205, 13027, 155697, 196541]
5760                                [2679, 14531, 215397]
Name: keywords, Length: 5761, dtype: object

In [103]:
word_counts={}
for word_list in x_raw:
    for word in word_list:
        if word in word_counts:
            word_counts[word]+=1
        else:
            word_counts[word]=1

In [104]:
#reversing key to values and value to key and making it in descending order
word_counted_sorted={key:value for key,value in sorted(word_counts.items(),key=lambda item:item[1],reverse=True)}

In [105]:
#taking first hundred from word_counted_sorted dictionary
vocabulary=list(word_counted_sorted)[:100]


In [106]:
#iterating through the x_raw and selection only those row which has words in vocabular
for word_list in x_raw:
    word_list[:]=[word for word in word_list if word in vocabulary]

In [107]:
x_raw

0                         [931, 6054]
1                                  []
2       [642, 703, 9826, 9937, 10594]
3                                  []
4                              [3799]
                    ...              
5756                               []
5757                         [187056]
5758                         [187056]
5759                          [13027]
5760                               []
Name: keywords, Length: 5761, dtype: object

In [108]:
#adding these index number which has empty values 
null_indices=set()
for i,words in enumerate(x_raw):
    if not words:
        null_indices.add(i)

In [109]:
#droping the null indices and then reseting the index
x_raw=x_raw.drop(null_indices,axis=0).reset_index(drop=True)
y=y.drop(null_indices,axis=0).reset_index(drop=True)

In [110]:
mlb=MultiLabelBinarizer()
x=pd.DataFrame(mlb.fit_transform(x_raw))

In [111]:
#creating unique list of all the word in x_raw
word_column_names=[]
for word_list in x_raw:
    for word in word_list:
        if word not in word_column_names:
            word_column_names.append(word)

In [112]:
word_column_names

[931,
 6054,
 642,
 703,
 9826,
 9937,
 10594,
 3799,
 1228,
 818,
 612,
 616,
 12670,
 187056,
 13142,
 14819,
 14964,
 10714,
 30,
 549,
 3737,
 1566,
 2041,
 4565,
 10084,
 4379,
 378,
 570,
 596,
 6149,
 158718,
 4344,
 6027,
 128,
 380,
 1664,
 2483,
 9748,
 10183,
 494,
 11221,
 34094,
 236,
 1956,
 90,
 1157,
 2038,
 2343,
 13005,
 5565,
 195402,
 9663,
 572,
 179430,
 6091,
 13008,
 9673,
 3800,
 14534,
 18035,
 736,
 5600,
 10508,
 242,
 417,
 15162,
 34079,
 11612,
 237,
 13027,
 779,
 1930,
 233,
 1415,
 1155,
 9951,
 10685,
 255,
 966,
 15160,
 13006,
 970,
 6038,
 2652,
 65,
 212,
 470,
 6075,
 1326,
 1605,
 13130,
 10292,
 293,
 254,
 220,
 12377,
 6270,
 1299,
 154802,
 179431]

In [113]:
#multilabelbinarizer encode multiple item in the list
mlb=MultiLabelBinarizer()
X=pd.DataFrame(mlb.fit_transform(x_raw),columns=word_column_names)

In [114]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [115]:
#mapping one list to another list
word_column_names=list(map(lambda x: word_dictionary[x],word_column_names))

In [116]:
x_raw

0                         [931, 6054]
1       [642, 703, 9826, 9937, 10594]
2                              [3799]
3                              [1228]
4                               [818]
                    ...              
3680       [293, 2483, 10183, 187056]
3681                   [10292, 12377]
3682                         [187056]
3683                         [187056]
3684                          [13027]
Name: keywords, Length: 3685, dtype: object

In [117]:
#splitting the dataset into train and test dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=24)

In [118]:
x.shape

(3685, 100)

In [119]:
inputs=tf.keras.Input(shape=(100,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(1,activation='linear')(x)
model=tf.keras.Model(inputs,outputs)

In [120]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                6464      
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 65        
Total params: 10,689
Trainable params: 10,689
Non-trainable params: 0
_________________________________________________________________


In [121]:
model.compile(optimizer='adam',
             loss='mse')
batch_size=32
epochs=100
history=model.fit(x_train,y_train,validation_split=0.2,
                 batch_size=batch_size,epochs=epochs,
                 callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [122]:
fig=px.line(
    history.history,
    y=['loss','val_loss'],
    labels={'x':'Epoch','y':'Loss'},
    title='Loss over Time')
fig.show()

In [123]:
model.evaluate(x_test,y_test)



0.349179208278656

In [124]:
y_preds=model.predict(x_test)
y_preds=np.squeeze(y_preds)

In [125]:
y_test.shape

(1106,)

In [126]:
y_preds.shape

(1106,)

In [127]:
rsquare=tfa.metrics.RSquare()
rsquare.update_state(y_test,y_preds)

In [128]:
print('R^ Score',rsquare.result().numpy())

R^ Score -0.1717391


In [134]:
pd.DataFrame(pca.fit_transform(X),columns=['PC1','PC2'])

AttributeError: module 'sklearn.decomposition._pca' has no attribute 'fit_transform'

In [130]:
X

Unnamed: 0,931,6054,642,703,9826,9937,10594,3799,1228,818,...,13130,10292,293,254,220,12377,6270,1299,154802,179431
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
