<a href="https://colab.research.google.com/github/vassilyf/ml/blob/main/REC_homework_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import os
import re
from google.colab import files
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

In [2]:
uploaded = files.upload()

Saving ml-latest-small.zip to ml-latest-small.zip


In [3]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [4]:
prefix = 'ml-latest-small'
links = pd.read_csv(os.path.join(prefix, 'links.csv'))
movies = pd.read_csv(os.path.join(prefix, 'movies.csv'))
ratings = pd.read_csv(os.path.join(prefix, 'ratings.csv'))
tags = pd.read_csv(os.path.join(prefix, 'tags.csv'))

In [7]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
ratings['userId'].value_counts()[:20]

414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
249    1046
387    1027
182     977
307     975
603     943
298     939
177     904
318     879
232     862
480     836
Name: userId, dtype: int64

In [10]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [18]:
movies[movies['movieId'] == 296]

Unnamed: 0,movieId,title,genres
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [6]:
def normalize(s):
    return re.sub('[^A-Za-z0-9|]+', '', str(s)).replace('|', ' ').lower()

In [44]:
normalize('asdasd sAdadas| dd')

'asdasdsadadas|dd'

In [32]:
movie_genres = [normalize(g) for g in movies.genres.values]
movie_genres[:10]

['adventure|animation|children|comedy|fantasy',
 'adventure|children|fantasy',
 'comedy|romance',
 'comedy|drama|romance',
 'comedy',
 'action|crime|thriller',
 'comedy|romance',
 'adventure|children',
 'action',
 'action|adventure|thriller']

In [11]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [27]:
tags_grouped = tags.copy()
tags_grouped['tag'] = tags_grouped[['movieId', 'tag']].groupby('movieId')['tag'].transform(lambda x: ' '.join(map(lambda s: normalize(s), x)))
tags_grouped = tags_grouped[['movieId', 'tag']].drop_duplicates()

movies_with_tags_rating = ratings[['movieId', 'userId', 'rating']] \
    .merge(movies[['movieId', 'genres']], on='movieId') \
    .merge(tags_grouped, on='movieId')

movies_with_tags_rating['genres'] = movies_with_tags_rating['genres'].transform(lambda s: normalize(s))

movies_with_tags_rating.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 48287 entries, 0 to 48286
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  48287 non-null  int64  
 1   userId   48287 non-null  int64  
 2   rating   48287 non-null  float64
 3   genres   48287 non-null  object 
 4   tag      48287 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 2.2+ MB


In [44]:
movie_genres = movies_with_tags_rating['genres'].values
tfidf = TfidfVectorizer()
X_tfidf_genres = tfidf.fit_transform(movie_genres)
genres_df = pd.DataFrame(X_tfidf_genres.toarray(), columns=tfidf.get_feature_names_out())
tfidf = TfidfVectorizer()
movie_tags = movies_with_tags_rating['tag'].values
X_tfidf_tags = tfidf.fit_transform(movie_tags)
tags_df = pd.DataFrame(X_tfidf_tags.toarray(), columns=tfidf.get_feature_names_out())
#pd.merge_ordered(movies_with_tags_rating, genres_df)
movies_for_regression = pd.concat([
    #movies_with_tags_rating[['movieId', 'userId', 'rating']].reset_index(drop=True),
    movies_with_tags_rating[['userId', 'rating']].reset_index(drop=True),
    genres_df.reset_index(drop=True),
    tags_df.reset_index(drop=True)], axis=1)

print(len(movies_for_regression.columns.tolist()))
movies_for_regression.info()

1478
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48287 entries, 0 to 48286
Columns: 1478 entries, userId to zooeydeschanel
dtypes: float64(1477), int64(1)
memory usage: 544.5 MB


In [38]:
def run_fit(_df, target_name):
  X_train, X_test, y_train, y_test = train_test_split(_df.drop(columns=[target_name]), _df[target_name], test_size=0.2, random_state=42)

  #model = DecisionTreeRegressor()
  model = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
  model.fit(X_train, y_train)
  y_pred_on_train = model.predict(X_train)
  y_pred_on_test = model.predict(X_test)
  print(f"model score={model.score(X_test, y_test)}")
  print(f"RMSE score, train={np.sqrt(mean_squared_error(y_train, y_pred_on_train))}, test={np.sqrt(mean_squared_error(y_test, y_pred_on_test))}")

In [42]:
movies_one_user = movies_for_regression[movies_for_regression['userId'] == 448] # 599
movies_one_user.head()

run_fit(movies_one_user, "rating")


model score=-4.438404301912769e+25
RMSE score, train=0.25114774027669184, test=6674050408272.262
