In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import pickle
import ast
from ast import literal_eval

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

# Movie model

In [6]:
movie = pd.read_csv("prepocessed_movies_metadata.csv",converters={"genres_name": literal_eval, "production_countries": literal_eval, "genres_id": literal_eval})
user = pd.read_csv("rating_with_user_metadata.csv")
movie_with_nan = pd.read_csv("movies_metadata_with_nan.csv",converters={"genres_name": literal_eval, "production_countries": literal_eval, "genres_id": literal_eval})


In [7]:
movie = movie.rename(columns={"id": "movieId"})
movie_with_nan = movie_with_nan.rename(columns={"id": "movieId"})

In [8]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5309 entries, 0 to 5308
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                5309 non-null   float64
 1   movieId               5309 non-null   int64  
 2   production_countries  5309 non-null   object 
 3   original_language     5309 non-null   object 
 4   original_title        5309 non-null   object 
 5   popularity            5309 non-null   float64
 6   revenue               5309 non-null   float64
 7   runtime               5309 non-null   float64
 8   vote_average          5309 non-null   float64
 9   vote_count            5309 non-null   float64
 10  genres_name           5309 non-null   object 
 11  genres_id             5309 non-null   object 
 12  release_year          5309 non-null   int64  
dtypes: float64(6), int64(2), object(5)
memory usage: 539.3+ KB


In [9]:
movie_with_nan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38319 entries, 0 to 38318
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                8564 non-null   float64
 1   movieId               38319 non-null  int64  
 2   production_countries  38319 non-null  object 
 3   original_language     38311 non-null  object 
 4   original_title        38319 non-null  object 
 5   popularity            38319 non-null  float64
 6   revenue               7242 non-null   float64
 7   runtime               38193 non-null  float64
 8   vote_average          36644 non-null  float64
 9   vote_count            38319 non-null  float64
 10  genres_name           38319 non-null  object 
 11  genres_id             38319 non-null  object 
 12  release_year          38319 non-null  int64  
dtypes: float64(6), int64(2), object(5)
memory usage: 3.8+ MB


In [10]:
user.head()

Unnamed: 0,userId,movieId,rating,timestamp,Gender,Age,Occupation,Zip-code
0,1,110,1.0,1425941529,F,1,10,48067
1,1,147,4.5,1425942435,F,1,10,48067
2,1,858,5.0,1425941523,F,1,10,48067
3,1,1221,5.0,1425941546,F,1,10,48067
4,1,1246,5.0,1425941556,F,1,10,48067


In [11]:
def columns_to_dummy(columns, df):
    final_df = df.copy()
    for column in columns:
        print(column)
        df_column_dummy = pd.Series(df[column])
        df_column_dummy = pd.get_dummies(df_column_dummy.apply(pd.Series).stack()).sum(level=0)
        final_df = pd.concat([final_df, df_column_dummy], axis=1)
    final_df.drop(labels=columns, axis=1, inplace=True)
    return final_df

In [12]:
movie_dummy = columns_to_dummy(["original_language", "production_countries", "genres_name"], movie)

original_language
production_countries
genres_name


In [13]:
movie_with_nan_dummy = columns_to_dummy(["original_language", "production_countries", "genres_name"], movie_with_nan)

original_language
production_countries
genres_name


In [27]:
movie_with_nan_dummy = movie_with_nan_dummy.dropna(subset=["revenue"])

In [15]:
movie_dummy

Unnamed: 0,budget,movieId,original_title,popularity,revenue,runtime,vote_average,vote_count,genres_id,release_year,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,30000000.0,862,Toy Story,21.946943,373554033.0,81.0,7.7,5415.0,"[16, 35, 10751]",1995,...,0,0,0,0,0,0,0,0,0,0
1,65000000.0,8844,Jumanji,17.015539,262797249.0,104.0,6.9,2413.0,"[12, 14, 10751]",1995,...,0,0,0,0,0,0,0,0,0,0
2,16000000.0,31357,Waiting to Exhale,3.859495,81452156.0,127.0,6.1,34.0,"[35, 18, 10749]",1995,...,0,0,0,0,1,0,0,0,0,0
3,60000000.0,949,Heat,17.924927,187436818.0,170.0,7.7,1886.0,"[28, 80, 18, 53]",1995,...,0,0,0,0,0,0,0,1,0,0
4,35000000.0,9091,Sudden Death,5.231580,64350171.0,106.0,5.5,174.0,"[28, 12, 53]",1995,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5304,11000000.0,395834,Wind River,40.796775,184770205.0,111.0,7.4,181.0,"[28, 80, 9648, 53]",2017,...,0,0,0,1,0,0,0,1,0,0
5305,12000000.0,24049,சிவாஜி,1.323587,19000000.0,185.0,6.9,25.0,"[28, 35, 18]",2007,...,0,0,0,0,0,0,0,0,0,0
5306,750000.0,280422,Все и сразу,0.201582,3.0,0.0,6.0,4.0,"[80, 35]",2014,...,0,0,0,0,0,0,0,0,0,0
5307,2000000.0,63281,Про любоff,0.121844,1268793.0,107.0,4.0,3.0,"[10749, 18]",2010,...,0,0,0,0,1,0,0,0,0,0


In [28]:
movie_with_nan_dummy

Unnamed: 0,budget,movieId,original_title,popularity,revenue,runtime,vote_average,vote_count,genres_id,release_year,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,30000000.0,862,Toy Story,21.946943,373554033.0,81.0,7.7,5415.0,"[16, 35, 10751]",1995,...,0,0,0,0,0,0,0,0,0,0
1,65000000.0,8844,Jumanji,17.015539,262797249.0,104.0,6.9,2413.0,"[12, 14, 10751]",1995,...,0,0,0,0,0,0,0,0,0,0
3,16000000.0,31357,Waiting to Exhale,3.859495,81452156.0,127.0,6.1,34.0,"[35, 18, 10749]",1995,...,0,0,0,0,1,0,0,0,0,0
4,,11862,Father of the Bride Part II,8.387519,76578911.0,106.0,5.7,173.0,[35],1995,...,0,0,0,0,0,0,0,0,0,0
5,60000000.0,949,Heat,17.924927,187436818.0,170.0,7.7,1886.0,"[28, 80, 18, 53]",1995,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38256,,264269,Vladeniye 18,0.217441,320395.0,90.0,4.4,4.0,"[27, 53, 9648]",2014,...,0,1,0,1,0,0,0,1,0,0
38268,750000.0,280422,Все и сразу,0.201582,3.0,0.0,6.0,4.0,"[80, 35]",2014,...,0,0,0,0,0,0,0,0,0,0
38270,,240789,Чудо,0.436028,50656.0,110.0,6.3,3.0,"[18, 36, 9648]",2009,...,1,0,0,1,0,0,0,0,0,0
38278,2000000.0,63281,Про любоff,0.121844,1268793.0,107.0,4.0,3.0,"[10749, 18]",2010,...,0,0,0,0,1,0,0,0,0,0


In [17]:
columns = movie_dummy.columns.to_list()

In [18]:
columns.remove("original_title")

In [19]:
columns.remove("genres_id")

In [20]:
columns.remove("revenue")

In [21]:
X, y = movie_dummy[columns], movie_dummy['revenue']
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.7388


In [29]:
columns = movie_with_nan_dummy.columns.to_list()

In [30]:
columns.remove("original_title")

In [31]:
columns.remove("genres_id")

In [32]:
columns.remove("revenue")

In [33]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X, y = imputer.fit_transform(movie_with_nan_dummy[columns]), movie_with_nan_dummy['revenue']
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.7392


# User model

In [None]:
user_dummy = columns_to_dummy(["Gender", "Zip-code"], user)

In [None]:
user_dummy.Age.unique()

In [90]:
user_movie = pd.merge(user_dummy,movie_dummy,left_on='movieId',right_on='movieId',how='left')

In [91]:
user_movie_drop = user_movie.dropna()

In [92]:
user_movie_drop.head()

Unnamed: 0,userId,movieId,rating,timestamp,Age,Occupation,F,M,00231,00606,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
2,1,858,5.0,1425941523,1,10,1,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1,1246,5.0,1425941556,1,10,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,2959,4.0,1425941601,1,10,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,2,5,3.0,867039249,56,16,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,2,25,3.0,867039168,56,16,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [93]:
user_movie_drop.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'Age', 'Occupation', 'F',
       'M', '00231', '00606',
       ...
       'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
       'TV Movie', 'Thriller', 'War', 'Western'],
      dtype='object', length=3595)

In [94]:
columns = user_movie_drop.columns.to_list()

In [95]:
columns.remove("original_title")
columns.remove("genres_id")
columns.remove("rating")

In [96]:
X, y = user_movie_drop[columns], user_movie_drop['rating']
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.1801


In [106]:
movieId_list = user_movie_drop.movieId.unique()
userId_list = user_movie_drop.userId.unique()

In [108]:
columns = ["userId"]+list(movieId_list)

In [136]:
df = pd.DataFrame({"userId" : userId_list})

In [137]:
for movieId in list(movieId_list):
    df[movieId] = np.nan

In [138]:
rating_list = user_movie_drop[["userId", "movieId", "rating"]].to_dict('records')

In [139]:
df.head()

Unnamed: 0,userId,858,1246,2959,5,25,58,64,79,141,...,579,576,3902,96239,676,7270,8069,77883,40494,2503
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,,,,,,,,,,


In [140]:
rating_list

[{'userId': 1, 'movieId': 858, 'rating': 5.0},
 {'userId': 1, 'movieId': 1246, 'rating': 5.0},
 {'userId': 1, 'movieId': 2959, 'rating': 4.0},
 {'userId': 2, 'movieId': 5, 'rating': 3.0},
 {'userId': 2, 'movieId': 25, 'rating': 3.0},
 {'userId': 2, 'movieId': 58, 'rating': 3.0},
 {'userId': 2, 'movieId': 64, 'rating': 4.0},
 {'userId': 2, 'movieId': 79, 'rating': 4.0},
 {'userId': 2, 'movieId': 141, 'rating': 3.0},
 {'userId': 2, 'movieId': 339, 'rating': 5.0},
 {'userId': 2, 'movieId': 377, 'rating': 4.0},
 {'userId': 2, 'movieId': 605, 'rating': 4.0},
 {'userId': 2, 'movieId': 628, 'rating': 4.0},
 {'userId': 2, 'movieId': 762, 'rating': 3.0},
 {'userId': 2, 'movieId': 786, 'rating': 1.0},
 {'userId': 2, 'movieId': 788, 'rating': 1.0},
 {'userId': 3, 'movieId': 500, 'rating': 2.0},
 {'userId': 3, 'movieId': 858, 'rating': 4.0},
 {'userId': 4, 'movieId': 223, 'rating': 4.0},
 {'userId': 4, 'movieId': 415, 'rating': 4.0},
 {'userId': 4, 'movieId': 1422, 'rating': 4.0},
 {'userId': 4, '

In [141]:
for rating in rating_list:
    df.loc[df['userId'] == rating["userId"], [rating["movieId"]]] = rating['rating']

In [142]:
df

Unnamed: 0,userId,858,1246,2959,5,25,58,64,79,141,...,579,576,3902,96239,676,7270,8069,77883,40494,2503
0,1,5.0,5.0,4.0,,,,,,,...,,,,,,,,,,
1,2,,,,3.0,3.0,3.0,4.0,4.0,3.0,...,,,,,,,,,,
2,3,4.0,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,5.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,6036,,,4.5,,,,,,,...,,,,,,,,,,
5568,6037,,,,,,,,,5.0,...,,,,,,,,,,
5569,6038,,,,3.0,3.0,,,,5.0,...,,,,,,,,,,
5570,6039,,,,,,,,3.0,3.0,...,,,,,,,,,,


In [152]:
user_movie_drop.movieId.value_counts()

296     2048
2959    1396
858     1359
150     1343
592     1263
        ... 
8584       1
8944       1
9008       1
820        1
3902       1
Name: movieId, Length: 1309, dtype: int64

In [143]:
df[296].value_counts()

5.0    846
4.0    490
4.5    269
3.0    208
3.5    111
2.0     47
1.0     33
2.5     22
0.5     12
1.5     10
Name: 296, dtype: int64

In [149]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_drop = df.dropna(subset=[296])
columns = df_drop.columns.to_list()
columns.remove(296)
X, y = imputer.fit_transform(df_drop[columns]), df_drop[296]
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.6406


In [153]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_drop = df.dropna(subset=[2959])
columns = df_drop.columns.to_list()
columns.remove(2959)
X, y = imputer.fit_transform(df_drop[columns]), df_drop[2959]
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.7059


In [154]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_drop = df.dropna(subset=[858])
columns = df_drop.columns.to_list()
columns.remove(858)
X, y = imputer.fit_transform(df_drop[columns]), df_drop[858]
lr = LinearRegression()
lr.fit(X, y)
print('R-squared: {:.4f}'.format(lr.score(X, y)))

R-squared: 0.7418


In [156]:
len(df.columns)

1310