<a href="https://colab.research.google.com/github/virf96/Basico/blob/main/Sistema_de_recomendaci%C3%B3n_(KNN___NB).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommender system

## Preparación de ambiente

### Carga de módulos

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Funciones relevantes

In [2]:
def get_dim(param_grid):
    return np.prod([x for x in map(len, param_grid.values())])

## Preparación de datos

### Carga de datos

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
movies = pd.read_csv("/content/drive/MyDrive/datasets/movies.csv")
links = pd.read_csv("/content/drive/MyDrive/datasets/links.csv")
ratings = pd.read_csv("/content/drive/MyDrive/datasets/ratings.csv")
tags = pd.read_csv("/content/drive/MyDrive/datasets/tags.csv")

### EDA

In [5]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
for dataset in [movies, links, ratings, tags]:
    print(dataset.shape)
    display(dataset.sample(5))

(9742, 3)


Unnamed: 0,movieId,title,genres
4586,6812,"Rookie, The (1990)",Action|Comedy|Thriller
3339,4522,Masquerade (1988),Mystery|Romance|Thriller
5732,30707,Million Dollar Baby (2004),Drama
8891,134515,BMX Bandits (1983),Adventure|Crime|Drama
7529,84615,Cedar Rapids (2011),Comedy


(9742, 3)


Unnamed: 0,movieId,imdbId,tmdbId
7147,71468,1071804,24869.0
7160,71732,902290,21910.0
4805,7160,340855,504.0
5939,34164,361693,14922.0
2793,3735,70666,9040.0


(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
19288,125,4643,4.0,1475407893
96446,603,1350,3.0,953925657
79668,495,54503,5.0,1458635508
1150,10,6377,3.5,1455306101
48046,312,750,5.0,1043176553


(3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
2122,474,6337,gambling,1138307244
2246,474,7086,George Bernard Shaw,1137201703
1679,474,2750,nostalgia,1137443255
2084,474,6195,books,1138498655
3514,599,296,Quotable,1498456446


In [7]:
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.00
1,2,113497,8844.00
2,3,113228,15602.00
3,4,114885,31357.00
4,5,113041,11862.00
...,...,...,...
9737,193581,5476944,432131.00
9738,193583,5914996,445030.00
9739,193585,6397426,479308.00
9740,193587,8391976,483455.00


### Ingeniería de datos

#### Rating global

In [8]:
ratings[["userId", "rating"]].groupby("userId").agg(["min", "max", "mean"])

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,min,max,mean
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1.00,5.00,4.37
2,2.00,5.00,3.95
3,0.50,5.00,2.44
4,1.00,5.00,3.56
5,1.00,5.00,3.64
...,...,...,...
606,0.50,5.00,3.66
607,1.00,5.00,3.79
608,0.50,5.00,3.13
609,3.00,4.00,3.27


In [9]:
aux = ratings[["userId", "rating"]].groupby("userId").agg(["min", "max", "mean"])


In [10]:
["_".join(x) for x in aux.columns]

['rating_min', 'rating_max', 'rating_mean']

In [11]:
aux.columns = ["_".join(x) for x in aux.columns]


In [12]:
ratings = ratings.merge(aux.reset_index(), on="userId", how = "left")

In [13]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,1,4.00,964982703,1.00,5.00,4.37
1,1,3,4.00,964981247,1.00,5.00,4.37
2,1,6,4.00,964982224,1.00,5.00,4.37
3,1,47,5.00,964983815,1.00,5.00,4.37
4,1,50,5.00,964982931,1.00,5.00,4.37
...,...,...,...,...,...,...,...
100831,610,166534,4.00,1493848402,0.50,5.00,3.69
100832,610,168248,5.00,1493850091,0.50,5.00,3.69
100833,610,168250,5.00,1494273047,0.50,5.00,3.69
100834,610,168252,5.00,1493846352,0.50,5.00,3.69


#### Tags

In [14]:
tags = tags[["movieId", "tag"]].groupby("movieId").agg([lambda x:" ".join(x)])

In [15]:
tags.sample(5)

Unnamed: 0_level_0,tag
Unnamed: 0_level_1,<lambda>
movieId,Unnamed: 1_level_2
8011,politics
6667,FBI
2054,Disney
1994,ghosts
318,prison Stephen King wrongful imprisonment Morg...


In [16]:
tags.columns = ["tags"]

In [17]:
vect = CountVectorizer(min_df=1, lowercase=True, max_features=30)

In [18]:
vect.fit(tags["tags"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=30, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [19]:
tags = tags.drop(columns=["tags"]).reset_index().join(pd.DataFrame(data = vect.transform(tags["tags"]).todense(), columns=vect.get_feature_names()))

In [20]:
tags

Unnamed: 0,movieId,action,and,atmospheric,bad,black,comedy,crime,dark,disney,ending,fi,funny,in,movie,music,netflix,provoking,queue,quirky,religion,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,183611,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1568,184471,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1569,187593,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1570,187595,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Movies (géneros)

In [21]:
movies["ls_genres"] = movies["genres"].str.split("|")
movies["n_genres"] = movies["ls_genres"].str.len()

In [22]:
movies["n_genres"]

0       5
1       3
2       2
3       3
4       1
       ..
9737    4
9738    3
9739    1
9740    2
9741    1
Name: n_genres, Length: 9742, dtype: int64

In [23]:
for i in range(movies["n_genres"].max()):
    movies[f"genre_{i}"] = movies.apply(lambda x:x["ls_genres"][i] if x["n_genres"] >= i+1 else np.nan, axis = 1)
    aux = movies[["movieId", f"genre_{i}"]].rename(columns={f"genre_{i}": "genre"})
    if i == 0:
        movies_genre = aux.copy()
    else:
        movies_genre = movies_genre.append(aux)

In [24]:
movies_genre.sort_values(by="movieId").head(20)

Unnamed: 0,movieId,genre
0,1,Adventure
0,1,Fantasy
0,1,
0,1,
0,1,Comedy
0,1,
0,1,
0,1,
0,1,Animation
0,1,Children


In [25]:
movies_genre.sort_values(by = "movieId").dropna()

Unnamed: 0,movieId,genre
0,1,Adventure
0,1,Fantasy
0,1,Comedy
0,1,Animation
0,1,Children
...,...,...
9738,193583,Fantasy
9739,193585,Drama
9740,193587,Animation
9740,193587,Action


In [26]:
mgr = movies_genre.dropna().merge(ratings, on="movieId", how="right")

In [27]:
mgr

Unnamed: 0,movieId,genre,userId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,Adventure,1,4.00,964982703,1.00,5.00,4.37
1,1,Animation,1,4.00,964982703,1.00,5.00,4.37
2,1,Children,1,4.00,964982703,1.00,5.00,4.37
3,1,Comedy,1,4.00,964982703,1.00,5.00,4.37
4,1,Fantasy,1,4.00,964982703,1.00,5.00,4.37
...,...,...,...,...,...,...,...,...
274475,160836,Drama,610,3.00,1493844794,0.50,5.00,3.69
274476,160836,Thriller,610,3.00,1493844794,0.50,5.00,3.69
274477,163937,Horror,610,3.50,1493848789,0.50,5.00,3.69
274478,163937,Thriller,610,3.50,1493848789,0.50,5.00,3.69


In [28]:
mgr = mgr[["userId", "genre", "rating"]].pivot_table(index="userId", columns="genre", aggfunc=["min", "max", "mean"])

In [29]:
mgr

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
genre,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3,Unnamed: 39_level_3,Unnamed: 40_level_3,Unnamed: 41_level_3,Unnamed: 42_level_3,Unnamed: 43_level_3,Unnamed: 44_level_3,Unnamed: 45_level_3,Unnamed: 46_level_3,Unnamed: 47_level_3,Unnamed: 48_level_3,Unnamed: 49_level_3,Unnamed: 50_level_3,Unnamed: 51_level_3,Unnamed: 52_level_3,Unnamed: 53_level_3,Unnamed: 54_level_3,Unnamed: 55_level_3,Unnamed: 56_level_3,Unnamed: 57_level_3,Unnamed: 58_level_3,Unnamed: 59_level_3,Unnamed: 60_level_3
1,,2.00,2.00,3.00,3.00,2.00,2.00,,1.00,2.00,5.00,2.00,,3.00,1.00,3.00,3.00,1.00,3.00,3.00,,5.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,,4.32,4.39,4.69,4.55,4.28,4.36,,4.53,4.30,5.00,3.47,,4.68,4.17,4.31,4.22,4.15,4.50,4.29
2,,3.00,3.50,,,3.00,2.00,3.00,2.00,,,3.00,3.00,,4.00,4.50,3.00,2.00,4.50,3.50,,5.00,5.00,,,5.00,5.00,5.00,5.00,,,3.00,4.50,,4.00,4.50,5.00,5.00,4.50,3.50,,3.95,4.17,,,4.00,3.80,4.33,3.88,,,3.00,3.75,,4.00,4.50,3.88,3.70,4.50,3.50
3,,0.50,0.50,0.50,0.50,0.50,0.50,,0.50,0.50,,4.00,,0.50,5.00,0.50,0.50,0.50,0.50,,,5.00,5.00,0.50,0.50,5.00,0.50,,4.50,5.00,,5.00,,0.50,5.00,0.50,5.00,5.00,0.50,,,3.57,2.73,0.50,0.50,1.00,0.50,,0.75,3.38,,4.69,,0.50,5.00,0.50,4.20,4.14,0.50,
4,,1.00,1.00,3.00,1.00,1.00,1.00,3.00,1.00,1.00,2.00,4.00,3.00,1.00,1.00,1.00,1.00,1.00,1.00,2.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,3.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,,3.32,3.66,4.00,3.80,3.51,3.81,4.00,3.48,3.68,4.00,4.25,3.00,4.00,3.48,3.38,2.83,3.55,3.57,3.80
5,,2.00,2.00,3.00,3.00,2.00,3.00,,1.00,3.00,,3.00,3.00,3.00,4.00,1.00,2.00,2.00,1.00,1.00,,4.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,,3.00,5.00,5.00,4.00,5.00,3.00,5.00,5.00,5.00,,3.11,3.25,4.33,4.11,3.47,3.83,,3.80,4.14,,3.00,3.67,4.40,4.00,3.09,2.50,3.56,3.33,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,0.50,1.50,2.50,1.50,0.50,0.50,3.00,0.50,2.00,3.00,1.00,2.00,2.50,2.00,1.00,1.50,0.50,2.00,2.50,,5.00,5.00,5.00,4.50,5.00,5.00,4.00,5.00,5.00,4.50,4.50,4.00,5.00,5.00,5.00,5.00,5.00,5.00,4.50,,3.18,3.50,3.71,3.45,3.57,3.65,3.80,3.79,3.60,3.81,3.35,3.06,3.73,3.79,3.74,3.56,3.53,3.79,3.41
607,,1.00,1.00,2.00,2.00,1.00,2.00,,2.00,3.00,,2.00,5.00,2.00,4.00,2.00,1.00,1.00,2.00,4.00,,5.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,4.00,,3.72,3.47,3.33,3.42,3.33,3.81,,4.01,3.57,,4.11,5.00,3.60,4.65,3.52,3.25,4.11,4.17,4.00
608,,0.50,0.50,0.50,0.50,0.50,0.50,2.00,0.50,0.50,2.00,1.00,2.00,0.50,0.50,0.50,0.50,0.50,1.50,1.00,,5.00,5.00,4.50,4.50,5.00,5.00,4.00,5.00,5.00,5.00,5.00,5.00,4.50,5.00,5.00,5.00,5.00,5.00,4.00,,3.33,3.22,3.12,2.46,2.74,3.61,3.00,3.44,3.00,3.75,3.32,4.00,2.76,3.55,2.89,3.30,3.54,3.58,2.64
609,,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,,3.00,3.00,,,3.00,3.00,3.00,3.00,4.00,,4.00,4.00,3.00,3.00,4.00,4.00,3.00,4.00,3.00,,4.00,3.00,,,4.00,3.00,4.00,4.00,4.00,,3.09,3.20,3.00,3.00,3.29,3.50,3.00,3.37,3.00,,3.50,3.00,,,3.20,3.00,3.29,3.50,4.00


In [30]:
mgr.columns = ["_".join(x) for x in mgr.columns]

In [31]:
mgr

Unnamed: 0_level_0,min_rating_(no genres listed),min_rating_Action,min_rating_Adventure,min_rating_Animation,min_rating_Children,min_rating_Comedy,min_rating_Crime,min_rating_Documentary,min_rating_Drama,min_rating_Fantasy,min_rating_Film-Noir,min_rating_Horror,min_rating_IMAX,min_rating_Musical,min_rating_Mystery,min_rating_Romance,min_rating_Sci-Fi,min_rating_Thriller,min_rating_War,min_rating_Western,max_rating_(no genres listed),max_rating_Action,max_rating_Adventure,max_rating_Animation,max_rating_Children,max_rating_Comedy,max_rating_Crime,max_rating_Documentary,max_rating_Drama,max_rating_Fantasy,max_rating_Film-Noir,max_rating_Horror,max_rating_IMAX,max_rating_Musical,max_rating_Mystery,max_rating_Romance,max_rating_Sci-Fi,max_rating_Thriller,max_rating_War,max_rating_Western,mean_rating_(no genres listed),mean_rating_Action,mean_rating_Adventure,mean_rating_Animation,mean_rating_Children,mean_rating_Comedy,mean_rating_Crime,mean_rating_Documentary,mean_rating_Drama,mean_rating_Fantasy,mean_rating_Film-Noir,mean_rating_Horror,mean_rating_IMAX,mean_rating_Musical,mean_rating_Mystery,mean_rating_Romance,mean_rating_Sci-Fi,mean_rating_Thriller,mean_rating_War,mean_rating_Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1,,2.00,2.00,3.00,3.00,2.00,2.00,,1.00,2.00,5.00,2.00,,3.00,1.00,3.00,3.00,1.00,3.00,3.00,,5.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,,4.32,4.39,4.69,4.55,4.28,4.36,,4.53,4.30,5.00,3.47,,4.68,4.17,4.31,4.22,4.15,4.50,4.29
2,,3.00,3.50,,,3.00,2.00,3.00,2.00,,,3.00,3.00,,4.00,4.50,3.00,2.00,4.50,3.50,,5.00,5.00,,,5.00,5.00,5.00,5.00,,,3.00,4.50,,4.00,4.50,5.00,5.00,4.50,3.50,,3.95,4.17,,,4.00,3.80,4.33,3.88,,,3.00,3.75,,4.00,4.50,3.88,3.70,4.50,3.50
3,,0.50,0.50,0.50,0.50,0.50,0.50,,0.50,0.50,,4.00,,0.50,5.00,0.50,0.50,0.50,0.50,,,5.00,5.00,0.50,0.50,5.00,0.50,,4.50,5.00,,5.00,,0.50,5.00,0.50,5.00,5.00,0.50,,,3.57,2.73,0.50,0.50,1.00,0.50,,0.75,3.38,,4.69,,0.50,5.00,0.50,4.20,4.14,0.50,
4,,1.00,1.00,3.00,1.00,1.00,1.00,3.00,1.00,1.00,2.00,4.00,3.00,1.00,1.00,1.00,1.00,1.00,1.00,2.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,3.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,,3.32,3.66,4.00,3.80,3.51,3.81,4.00,3.48,3.68,4.00,4.25,3.00,4.00,3.48,3.38,2.83,3.55,3.57,3.80
5,,2.00,2.00,3.00,3.00,2.00,3.00,,1.00,3.00,,3.00,3.00,3.00,4.00,1.00,2.00,2.00,1.00,1.00,,4.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,,3.00,5.00,5.00,4.00,5.00,3.00,5.00,5.00,5.00,,3.11,3.25,4.33,4.11,3.47,3.83,,3.80,4.14,,3.00,3.67,4.40,4.00,3.09,2.50,3.56,3.33,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,0.50,1.50,2.50,1.50,0.50,0.50,3.00,0.50,2.00,3.00,1.00,2.00,2.50,2.00,1.00,1.50,0.50,2.00,2.50,,5.00,5.00,5.00,4.50,5.00,5.00,4.00,5.00,5.00,4.50,4.50,4.00,5.00,5.00,5.00,5.00,5.00,5.00,4.50,,3.18,3.50,3.71,3.45,3.57,3.65,3.80,3.79,3.60,3.81,3.35,3.06,3.73,3.79,3.74,3.56,3.53,3.79,3.41
607,,1.00,1.00,2.00,2.00,1.00,2.00,,2.00,3.00,,2.00,5.00,2.00,4.00,2.00,1.00,1.00,2.00,4.00,,5.00,5.00,5.00,5.00,5.00,5.00,,5.00,5.00,,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,4.00,,3.72,3.47,3.33,3.42,3.33,3.81,,4.01,3.57,,4.11,5.00,3.60,4.65,3.52,3.25,4.11,4.17,4.00
608,,0.50,0.50,0.50,0.50,0.50,0.50,2.00,0.50,0.50,2.00,1.00,2.00,0.50,0.50,0.50,0.50,0.50,1.50,1.00,,5.00,5.00,4.50,4.50,5.00,5.00,4.00,5.00,5.00,5.00,5.00,5.00,4.50,5.00,5.00,5.00,5.00,5.00,4.00,,3.33,3.22,3.12,2.46,2.74,3.61,3.00,3.44,3.00,3.75,3.32,4.00,2.76,3.55,2.89,3.30,3.54,3.58,2.64
609,,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,3.00,,3.00,3.00,,,3.00,3.00,3.00,3.00,4.00,,4.00,4.00,3.00,3.00,4.00,4.00,3.00,4.00,3.00,,4.00,3.00,,,4.00,3.00,4.00,4.00,4.00,,3.09,3.20,3.00,3.00,3.29,3.50,3.00,3.37,3.00,,3.50,3.00,,,3.20,3.00,3.29,3.50,4.00


In [32]:
aux = mgr.isnull().mean().sort_values().reset_index()

In [33]:
aux

Unnamed: 0,index,0
0,mean_rating_Drama,0.0
1,max_rating_Drama,0.0
2,min_rating_Drama,0.0
3,min_rating_Thriller,0.0
4,mean_rating_Thriller,0.0
5,max_rating_Thriller,0.0
6,max_rating_Comedy,0.0
7,mean_rating_Comedy,0.0
8,min_rating_Comedy,0.0
9,mean_rating_Action,0.0


In [34]:
ls_keep = aux.loc[aux[0] <= 0.35, "index"].tolist()

In [35]:
mgr = mgr[ls_keep]

In [36]:
ratings = ratings.merge(mgr.reset_index(), on="userId", how = "left")

In [37]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,mean_rating_Thriller,max_rating_Thriller,max_rating_Comedy,mean_rating_Comedy,min_rating_Comedy,mean_rating_Action,max_rating_Action,min_rating_Action,max_rating_Adventure,mean_rating_Romance,min_rating_Adventure,min_rating_Romance,mean_rating_Adventure,max_rating_Romance,min_rating_Sci-Fi,max_rating_Sci-Fi,mean_rating_Sci-Fi,mean_rating_Crime,max_rating_Crime,min_rating_Crime,mean_rating_Fantasy,max_rating_Fantasy,min_rating_Fantasy,mean_rating_Mystery,max_rating_Mystery,min_rating_Mystery,mean_rating_Children,min_rating_Children,max_rating_Children,min_rating_War,mean_rating_War,max_rating_War,max_rating_Horror,mean_rating_Horror,min_rating_Horror,min_rating_Animation,mean_rating_Animation,max_rating_Animation,max_rating_Musical,mean_rating_Musical,min_rating_Musical,max_rating_IMAX,mean_rating_IMAX,min_rating_IMAX,mean_rating_Western,max_rating_Western,min_rating_Western
0,1,1,4.00,964982703,1.00,5.00,4.37,4.53,5.00,1.00,1.00,4.15,5.00,5.00,4.28,2.00,4.32,5.00,2.00,5.00,4.31,2.00,3.00,4.39,5.00,3.00,5.00,4.22,4.36,5.00,2.00,4.30,5.00,2.00,4.17,5.00,1.00,4.55,3.00,5.00,3.00,4.50,5.00,5.00,3.47,2.00,3.00,4.69,5.00,5.00,4.68,3.00,,,,4.29,5.00,3.00
1,1,3,4.00,964981247,1.00,5.00,4.37,4.53,5.00,1.00,1.00,4.15,5.00,5.00,4.28,2.00,4.32,5.00,2.00,5.00,4.31,2.00,3.00,4.39,5.00,3.00,5.00,4.22,4.36,5.00,2.00,4.30,5.00,2.00,4.17,5.00,1.00,4.55,3.00,5.00,3.00,4.50,5.00,5.00,3.47,2.00,3.00,4.69,5.00,5.00,4.68,3.00,,,,4.29,5.00,3.00
2,1,6,4.00,964982224,1.00,5.00,4.37,4.53,5.00,1.00,1.00,4.15,5.00,5.00,4.28,2.00,4.32,5.00,2.00,5.00,4.31,2.00,3.00,4.39,5.00,3.00,5.00,4.22,4.36,5.00,2.00,4.30,5.00,2.00,4.17,5.00,1.00,4.55,3.00,5.00,3.00,4.50,5.00,5.00,3.47,2.00,3.00,4.69,5.00,5.00,4.68,3.00,,,,4.29,5.00,3.00
3,1,47,5.00,964983815,1.00,5.00,4.37,4.53,5.00,1.00,1.00,4.15,5.00,5.00,4.28,2.00,4.32,5.00,2.00,5.00,4.31,2.00,3.00,4.39,5.00,3.00,5.00,4.22,4.36,5.00,2.00,4.30,5.00,2.00,4.17,5.00,1.00,4.55,3.00,5.00,3.00,4.50,5.00,5.00,3.47,2.00,3.00,4.69,5.00,5.00,4.68,3.00,,,,4.29,5.00,3.00
4,1,50,5.00,964982931,1.00,5.00,4.37,4.53,5.00,1.00,1.00,4.15,5.00,5.00,4.28,2.00,4.32,5.00,2.00,5.00,4.31,2.00,3.00,4.39,5.00,3.00,5.00,4.22,4.36,5.00,2.00,4.30,5.00,2.00,4.17,5.00,1.00,4.55,3.00,5.00,3.00,4.50,5.00,5.00,3.47,2.00,3.00,4.69,5.00,5.00,4.68,3.00,,,,4.29,5.00,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.00,1493848402,0.50,5.00,3.69,3.87,5.00,1.00,0.50,3.57,5.00,5.00,3.73,1.00,3.60,5.00,0.50,5.00,3.73,0.50,1.50,3.71,5.00,1.00,5.00,3.66,3.80,5.00,1.00,3.59,5.00,0.50,3.77,5.00,1.50,3.65,2.00,5.00,2.00,3.78,5.00,5.00,3.51,1.00,2.50,3.90,5.00,5.00,3.93,2.50,5.00,3.63,2.00,3.74,5.00,1.00
100832,610,168248,5.00,1493850091,0.50,5.00,3.69,3.87,5.00,1.00,0.50,3.57,5.00,5.00,3.73,1.00,3.60,5.00,0.50,5.00,3.73,0.50,1.50,3.71,5.00,1.00,5.00,3.66,3.80,5.00,1.00,3.59,5.00,0.50,3.77,5.00,1.50,3.65,2.00,5.00,2.00,3.78,5.00,5.00,3.51,1.00,2.50,3.90,5.00,5.00,3.93,2.50,5.00,3.63,2.00,3.74,5.00,1.00
100833,610,168250,5.00,1494273047,0.50,5.00,3.69,3.87,5.00,1.00,0.50,3.57,5.00,5.00,3.73,1.00,3.60,5.00,0.50,5.00,3.73,0.50,1.50,3.71,5.00,1.00,5.00,3.66,3.80,5.00,1.00,3.59,5.00,0.50,3.77,5.00,1.50,3.65,2.00,5.00,2.00,3.78,5.00,5.00,3.51,1.00,2.50,3.90,5.00,5.00,3.93,2.50,5.00,3.63,2.00,3.74,5.00,1.00
100834,610,168252,5.00,1493846352,0.50,5.00,3.69,3.87,5.00,1.00,0.50,3.57,5.00,5.00,3.73,1.00,3.60,5.00,0.50,5.00,3.73,0.50,1.50,3.71,5.00,1.00,5.00,3.66,3.80,5.00,1.00,3.59,5.00,0.50,3.77,5.00,1.50,3.65,2.00,5.00,2.00,3.78,5.00,5.00,3.51,1.00,2.50,3.90,5.00,5.00,3.93,2.50,5.00,3.63,2.00,3.74,5.00,1.00


#### TAD

In [38]:
df = ratings.drop(columns=["timestamp"]).merge(tags, on = "movieId", how ="left")
del movies, links, ratings, tags

In [39]:
df.head(5)

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,mean_rating_Thriller,max_rating_Thriller,max_rating_Comedy,mean_rating_Comedy,min_rating_Comedy,mean_rating_Action,max_rating_Action,min_rating_Action,max_rating_Adventure,mean_rating_Romance,min_rating_Adventure,min_rating_Romance,mean_rating_Adventure,max_rating_Romance,min_rating_Sci-Fi,max_rating_Sci-Fi,mean_rating_Sci-Fi,mean_rating_Crime,max_rating_Crime,min_rating_Crime,mean_rating_Fantasy,max_rating_Fantasy,min_rating_Fantasy,mean_rating_Mystery,max_rating_Mystery,min_rating_Mystery,mean_rating_Children,min_rating_Children,max_rating_Children,min_rating_War,...,max_rating_Animation,max_rating_Musical,mean_rating_Musical,min_rating_Musical,max_rating_IMAX,mean_rating_IMAX,min_rating_IMAX,mean_rating_Western,max_rating_Western,min_rating_Western,action,and,atmospheric,bad,black,comedy,crime,dark,disney,ending,fi,funny,in,movie,music,netflix,provoking,queue,quirky,religion,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,1,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,5.0,2.0,5.0,4.31,2.0,3.0,4.39,5.0,3.0,5.0,4.22,4.36,5.0,2.0,4.3,5.0,2.0,4.17,5.0,1.0,4.55,3.0,5.0,3.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,5.0,2.0,5.0,4.31,2.0,3.0,4.39,5.0,3.0,5.0,4.22,4.36,5.0,2.0,4.3,5.0,2.0,4.17,5.0,1.0,4.55,3.0,5.0,3.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,5.0,2.0,5.0,4.31,2.0,3.0,4.39,5.0,3.0,5.0,4.22,4.36,5.0,2.0,4.3,5.0,2.0,4.17,5.0,1.0,4.55,3.0,5.0,3.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,47,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,5.0,2.0,5.0,4.31,2.0,3.0,4.39,5.0,3.0,5.0,4.22,4.36,5.0,2.0,4.3,5.0,2.0,4.17,5.0,1.0,4.55,3.0,5.0,3.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,50,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,5.0,2.0,5.0,4.31,2.0,3.0,4.39,5.0,3.0,5.0,4.22,4.36,5.0,2.0,4.3,5.0,2.0,4.17,5.0,1.0,4.55,3.0,5.0,3.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [40]:
X = df[[x for x in df.columns if x not in ["userId", "movieId", "rating",'tags']]]
y = df[["rating"]]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

#### Escalamiento de datos

In [42]:
SimpleImputer?

In [43]:
im_x = SimpleImputer(strategy="constant", fill_value=-1)

In [44]:
sc_x = MinMaxScaler()
sc_y = MinMaxScaler()

In [45]:
X_train

Unnamed: 0,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,mean_rating_Thriller,max_rating_Thriller,max_rating_Comedy,mean_rating_Comedy,min_rating_Comedy,mean_rating_Action,max_rating_Action,min_rating_Action,max_rating_Adventure,mean_rating_Romance,min_rating_Adventure,min_rating_Romance,mean_rating_Adventure,max_rating_Romance,min_rating_Sci-Fi,max_rating_Sci-Fi,mean_rating_Sci-Fi,mean_rating_Crime,max_rating_Crime,min_rating_Crime,mean_rating_Fantasy,max_rating_Fantasy,min_rating_Fantasy,mean_rating_Mystery,max_rating_Mystery,min_rating_Mystery,mean_rating_Children,min_rating_Children,max_rating_Children,min_rating_War,mean_rating_War,max_rating_War,max_rating_Horror,...,max_rating_Animation,max_rating_Musical,mean_rating_Musical,min_rating_Musical,max_rating_IMAX,mean_rating_IMAX,min_rating_IMAX,mean_rating_Western,max_rating_Western,min_rating_Western,action,and,atmospheric,bad,black,comedy,crime,dark,disney,ending,fi,funny,in,movie,music,netflix,provoking,queue,quirky,religion,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
88568,1.00,5.00,4.12,4.19,5.00,1.00,1.00,4.18,5.00,5.00,3.88,1.00,3.91,5.00,1.00,5.00,3.98,1.00,1.00,3.97,5.00,1.00,5.00,3.93,4.35,5.00,1.00,3.29,5.00,1.00,4.29,5.00,2.00,4.00,2.00,5.00,3.00,4.55,5.00,5.00,...,5.00,3.00,2.67,2.00,4.00,4.00,4.00,4.67,5.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
36144,1.00,5.00,4.25,4.26,5.00,1.00,1.00,4.21,5.00,5.00,4.27,3.00,4.26,5.00,2.00,5.00,4.37,2.00,2.00,4.33,5.00,2.50,5.00,4.20,4.06,5.00,3.00,4.36,5.00,2.00,4.48,5.00,2.50,4.60,3.50,5.00,3.50,4.70,5.00,5.00,...,5.00,5.00,3.80,2.00,5.00,4.29,3.00,4.50,5.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
88778,0.50,5.00,4.21,4.14,5.00,0.50,0.50,4.19,5.00,5.00,4.33,0.50,4.17,5.00,0.50,5.00,3.81,0.50,0.50,4.12,5.00,0.50,5.00,4.14,4.28,5.00,0.50,4.16,5.00,0.50,4.37,5.00,0.50,4.35,0.50,5.00,3.00,4.32,5.00,5.00,...,5.00,4.50,4.11,3.00,5.00,4.30,1.50,3.50,5.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
45731,1.00,5.00,3.87,3.98,5.00,1.00,1.00,3.84,5.00,5.00,3.76,1.00,3.84,5.00,1.00,5.00,3.89,1.00,2.00,3.98,5.00,2.00,5.00,4.10,3.61,5.00,1.00,3.74,5.00,1.00,3.89,5.00,2.00,3.79,1.00,5.00,3.00,4.21,5.00,5.00,...,5.00,5.00,3.64,2.00,5.00,4.00,2.00,3.57,5.00,3.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2933,1.00,5.00,2.61,2.61,5.00,1.00,1.00,2.55,5.00,5.00,2.64,1.00,2.73,5.00,1.00,5.00,2.68,1.00,1.00,2.82,5.00,1.00,5.00,2.56,2.90,5.00,1.00,2.83,5.00,1.00,2.97,5.00,1.00,2.70,1.00,5.00,2.00,3.00,4.00,4.00,...,5.00,5.00,2.84,1.00,3.00,3.00,3.00,2.55,3.00,1.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73557,0.50,5.00,3.40,3.55,5.00,0.50,0.50,3.47,5.00,5.00,3.18,0.50,3.23,5.00,0.50,5.00,3.48,1.00,1.00,3.40,5.00,0.50,5.00,3.42,3.49,5.00,1.00,3.29,5.00,0.50,3.48,5.00,1.00,3.27,1.00,5.00,1.50,3.46,5.00,5.00,...,5.00,5.00,3.40,1.00,5.00,3.94,2.50,3.27,4.50,1.50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
91785,1.00,5.00,3.50,3.54,5.00,1.50,1.00,3.45,5.00,5.00,3.48,1.50,3.39,5.00,1.00,5.00,3.55,1.00,1.50,3.52,5.00,1.50,5.00,3.50,3.39,5.00,1.50,3.53,5.00,1.50,3.72,5.00,2.50,3.79,2.00,5.00,1.50,3.15,4.00,5.00,...,5.00,4.50,3.73,3.00,5.00,3.51,2.00,3.00,4.00,1.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,1.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
63669,0.50,5.00,3.39,3.56,5.00,1.00,1.00,3.29,5.00,5.00,3.27,1.00,3.28,5.00,0.50,5.00,3.38,1.00,1.00,3.42,5.00,1.00,5.00,3.40,3.48,5.00,0.50,3.41,5.00,0.50,3.57,5.00,1.00,3.42,1.00,5.00,1.00,3.83,5.00,5.00,...,5.00,5.00,3.45,1.50,5.00,3.72,2.00,3.58,5.00,1.50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
84412,1.00,5.00,3.39,3.84,5.00,1.00,1.00,3.57,5.00,5.00,3.14,1.00,3.24,5.00,1.00,5.00,3.47,1.00,2.00,3.52,5.00,1.00,5.00,3.22,3.27,5.00,1.00,2.86,5.00,1.00,3.80,5.00,1.00,3.18,1.00,5.00,5.00,5.00,5.00,4.00,...,5.00,5.00,3.40,3.00,3.00,3.00,3.00,3.40,5.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [46]:
X_train = im_x.fit_transform(X_train)

In [47]:
Xs = sc_x.fit_transform(X_train)
ys = sc_y.fit_transform(y_train)

In [48]:
Xs

array([[0.11111111, 1.        , 0.76445118, ..., 0.25      , 0.25      ,
        0.5       ],
       [0.11111111, 1.        , 0.7993157 , ..., 0.25      , 0.25      ,
        0.5       ],
       [0.        , 1.        , 0.78855693, ..., 0.25      , 0.25      ,
        0.5       ],
       ...,
       [0.        , 1.        , 0.56831061, ..., 0.        , 0.        ,
        0.        ],
       [0.11111111, 1.        , 0.56800123, ..., 0.25      , 0.25      ,
        0.5       ],
       [0.        , 1.        , 0.54404551, ..., 0.25      , 0.25      ,
        0.5       ]])

In [49]:
ys

array([[1.        ],
       [0.77777778],
       [0.66666667],
       ...,
       [0.77777778],
       [1.        ],
       [0.11111111]])

## Modelado

### K-Vecinos más cercanos

#### Cross validation

In [None]:
knn?

In [None]:
knn = KNeighborsRegressor(n_neighbors=100)
knn.fit(Xs, ys)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=100, p=2,
                    weights='uniform')

In [None]:
ls_scores = cross_val_score(X=Xs, y=ys, cv=4, n_jobs=-1, estimator=knn, scoring="r2")

In [None]:
np.mean(ls_scores), np.std(ls_scores)

(0.17598657633778197, 0.003904807011753219)

#### Hyperparametrización

In [None]:
 range(5, 105, 5)

range(5, 105, 5)

In [None]:
param_grid = {"n_neighbors": range(5, 50, 5),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [None]:
X.shape

(100836, 54)

In [None]:
pd.to_pickle(knn, "k_vecinos.diplo")

In [None]:
dim = get_dim(param_grid)
dim

216

In [None]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="r2", param_distributions=param_grid, verbose=5, n_iter=dim*0.1)

In [None]:
rs.fit(Xs, ys)

Fitting 4 folds for each of 21 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 200.4min
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed: 222.0min finished


RandomizedSearchCV(cv=4, error_score=-1000,
                   estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=100,
                                                 p=2, weights='uniform'),
                   iid='deprecated', n_iter=21.6, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'metric': ['euclidean', 'manhattan',
                                                   'chebyshev', 'minkowski',
                                                   'seuclidean',
                                                   'mahalanobis'],
                                        'n_neighbors': range(5, 50, 5)},
              

In [None]:
rs.best_score_

In [None]:
rs.best_estimator_

### Naïve Bayes

#### Cross validation

In [51]:
y_train

Unnamed: 0,rating
88568,5.00
36144,4.00
88778,3.50
45731,4.00
2933,2.00
...,...
73557,2.00
91785,3.50
63669,4.00
84412,5.00


In [52]:
y_train.describe()

Unnamed: 0,rating
count,70585.0
mean,3.5
std,1.04
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


In [53]:
yc = (y_train >= 5)*1
yc_test = (y_test >= 5)*1

In [55]:
yc

Unnamed: 0,rating
88568,1
36144,0
88778,0
45731,0
2933,0
...,...
73557,0
91785,0
63669,0
84412,1


In [None]:
nb?

In [None]:
nb = GaussianNB()
nb.fit(Xs, yc.values.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
ls_scores = cross_val_score(X=Xs, y=yc, cv=4, n_jobs=-1, estimator=nb, scoring="roc_auc")

In [None]:
np.mean(ls_scores), np.std(ls_scores)

(0.7301915259542806, 0.004626094998410264)

In [None]:
df

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,1,4.00,1.00,5.00,4.37,4.53,5.00,1.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,1,3,4.00,1.00,5.00,4.37,4.53,5.00,1.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,1,6,4.00,1.00,5.00,4.37,4.53,5.00,1.00,1.00,...,,,,,,,,,,
3,1,47,5.00,1.00,5.00,4.37,4.53,5.00,1.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
4,1,50,5.00,1.00,5.00,4.37,4.53,5.00,1.00,1.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.00,0.50,5.00,3.69,3.87,5.00,1.00,0.50,...,,,,,,,,,,
100832,610,168248,5.00,0.50,5.00,3.69,3.87,5.00,1.00,0.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
100833,610,168250,5.00,0.50,5.00,3.69,3.87,5.00,1.00,0.50,...,,,,,,,,,,
100834,610,168252,5.00,0.50,5.00,3.69,3.87,5.00,1.00,0.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [None]:
df[(df["movieId"] == 170875) & (df["userId"] == 610)]

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
100835,610,170875,3.0,0.5,5.0,3.69,3.87,5.0,1.0,0.5,...,,,,,,,,,,


In [None]:
df