In [1]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from pathlib import Path

import pandas as pd

In [7]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
import sys
sys.path.append('/content/gdrive/MyDrive/ml-practico/data') # tiene bien la barra

In [9]:
from lib import data, transformers # usa una nueva version de lib
from lib.model import get_features_pipe, get_model_pipe

In [5]:
PATH = Path('/content/gdrive/MyDrive/ml-practico/data/') # tiene bien la barra
movies_df = data.load_data(PATH)

Loading title basics...


  exec(code_obj, self.user_global_ns, self.user_ns)


Loading title ratings...
Loading movie directors...
Merging everything...


In [6]:
principals_df = pd.read_csv(PATH / 'title.principals.tsv', sep='\t')

In [10]:
principals_df.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [11]:
principals_df.category.value_counts()

actor                  9428011
actress                7087536
self                   6992466
writer                 5443248
director               4739306
producer               2533985
cinematographer        1472541
composer               1461940
editor                 1375324
production_designer     302528
archive_footage         245691
archive_sound             2605
Name: category, dtype: int64

In [12]:
# Sacado del codigo de directores, al director le calcula los features

movies_stars = principals_df[principals_df.category.isin(['actress', 'actor'])].copy() # solo actrices y actores

# Calculo un ranking por pelicula segun el ordering
movies_stars['star_rank'] = (
    movies_stars.sort_values('ordering')
        .groupby('tconst')
        .cumcount()
) # cual es el que aparece primero, segundo, etc 

first_star = movies_stars[movies_stars.star_rank == 0][['nconst', 'tconst']].rename(columns={'nconst': '1st_star'})
second_star = movies_stars[movies_stars.star_rank == 1][['nconst', 'tconst']].rename(columns={'nconst': '2nd_star'})
third_star = movies_stars[movies_stars.star_rank == 2][['nconst', 'tconst']].rename(columns={'nconst': '3rd_star'})

In [13]:
stars_df = (
    first_star.merge(second_star, how='left', on='tconst')
              .merge(third_star, how='left', on='tconst')
) # no repetimos filas de la misma peli, una peli es una fila

In [14]:
stars_df.head()

Unnamed: 0,1st_star,tconst,2nd_star,3rd_star
0,nm0443482,tt0000005,nm0653042,
1,nm0179163,tt0000007,nm0183947,
2,nm0653028,tt0000008,,
3,nm0063086,tt0000009,nm0183823,nm1309758
4,nm3692297,tt0000011,,


In [15]:
stars_df[stars_df.tconst == 'tt0120338'] # titanic

Unnamed: 0,1st_star,tconst,2nd_star,3rd_star
107363,nm0000138,tt0120338,nm0000701,nm0000708


In [16]:
movies_df = movies_df.merge(stars_df, on='tconst', how='left')

In [17]:
movies_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director,1st_star,2nd_star,3rd_star
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,\N,45.0,[Romance],5.9,154,nm0085156,nm0063086,nm0183823,nm1309758
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,\N,20.0,"[Documentary, News, Sport]",5.2,356,nm0714557,,,
2,tt0000502,movie,Bohemios,Bohemios,0,1905.0,\N,100.0,[no-genre],3.8,6,nm0063413,nm0215752,nm0252720,
3,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,\N,70.0,"[Biography, Crime, Drama]",6.1,589,nm0846879,nm0846887,nm0846894,nm3002376
4,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,\N,120.0,"[Adventure, Fantasy]",5.2,37,nm0091767,nm0000875,nm0122665,nm0933446


# Ahora  vamos a experimentar!

Como podemos hacer para usar 1st_star y 2nd_star con el código que **ya** tenemos? [Miremos el diff](https://github.com/elsonidoq/machine_learning_practico/commit/1244da3daee2f7aff140d202885e6e8dba55c099)

In [18]:
rating_data = data.load_rating_train_dev_test(movies_df)

**Mi unico feature va a ser el protagonista para predecir si la peli va a tener mas de 7.5 en raiting.**

In [19]:
pipe = make_pipeline(
    transformers.CrewFeatures('1st_star', min_cnt_movies=3), # cantidad minima de pelis=3
    DictVectorizer(sparse=False), # lo vectorizo
    StandardScaler(),
    LogisticRegression() # regresion logistica 
)

In [20]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.6694917991958195, 0.5516081278474014)

**Roc auc menor a 0.5 es muy pobre, es peor que chance. Si cambio a min_cnt_movies=1, training da bastante mejor y validation da un poco peor.**

**meto a los 3 protagonistas y me fijo**

In [21]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [22]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.7266249341879818, 0.5782582569516197)

**development subio un poco pero el overfitting subio un monton (mejor en training).**

# Probando todo junto

**actores, director, generos, anios**

In [23]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [24]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.8433724090930335, 0.7368580621136858)

**subio un monton. Ahora dejo solo director, years ago y generos.** 

In [25]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression() # es un modelo lineal
)

In [26]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.8189300134330326, 0.7315218069641523)

**en development dio igual, en training dio peor. Es decir que los actores solo nos suman overfitting.**

**regresion lineal es un modelo lineal. Un modelo no lineal seria word2vec.**

**saco el director dejo solo years ago y genero:**

In [27]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [28]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.7419233952069703, 0.7288106516906283)

**y bajo un poquito el development. Al fin y al cabo todo lo que estamos prediciendo es con el genero.**

**ahora prueba con un modelo no lineal (GradientBoostingClassifier), tiene muchos mas grados de libertad.**

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    GradientBoostingClassifier(),
)

In [30]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.9175962044769267, 0.7461476500702391)

**sube mucho en training cambio de pocos puntos mas en development. es bastante de overfitting. habria que hacer regularizacion con los hiperparametros del gradientboostingclassifier: min sample split, min samples leaf, y agrandar min cnt movies.** 

# Word2Vec features

In [32]:
class EpochSaver: pass

from gensim.models import Word2Vec

w2v = Word2Vec.load('/content/gdrive/MyDrive/ml-practico/data/w2v/epoch_10')

In [34]:
import numpy as np

default_vector = np.mean(w2v.wv.vectors, axis=0) # dim 100
# ver a quien se parece este default vector

**copia y pega todo de crew features (adentro de lib)**

### TO DO: revisar a quien se parece este default_vector

Seria mejor un vector se 0s? alguna otra agregacion sobre los datos?

In [37]:
x_i = rating_data['X_train'][0] # un ejemplo puntual a mano
fields = ['1st_star', '2nd_star', '3rd_star', 'director']
min_cnt_movies = 2

vectors = []
for field in fields:
    person_id = x_i[field] # quien es la persona
    if person_id not in w2v.wv or w2v.wv.vocab[person_id].count < min_cnt_movies: continue
       # hay gente q no esta en el w2v (tiene un min count de 5) por eso es necesario tmb decirle que pase si no esta 
    vectors.append(w2v.wv[person_id])

if len(vectors) == 0: # si la lista esta vacia
    result = default_vector
else:
    result = np.mean(vectors, axis=0)

In [36]:
vectors # tiene 2 vectores
#vectors puede ser una lista vacia, necesito un defalt vector

[array([ 4.70933318e-02, -1.34612575e-01,  2.46635266e-02, -9.10148472e-02,
        -5.29428683e-02, -2.28914514e-01, -1.33385265e-03,  1.14006080e-01,
        -3.01423483e-02, -4.42864746e-02, -1.00941554e-01,  1.62525754e-02,
        -6.02971539e-02,  1.27083093e-01,  1.95889965e-01,  2.96445161e-01,
         7.73383901e-02, -2.71949302e-02, -1.65050011e-02, -6.02033846e-02,
        -3.97554077e-02, -2.78549612e-01,  3.22493702e-01, -4.27541047e-01,
         5.54660661e-03,  3.09594840e-01, -1.15219213e-01, -3.38158645e-02,
        -1.66705281e-01, -7.77427405e-02, -2.26080328e-01, -1.14626020e-01,
        -3.80306780e-01,  4.18189675e-01,  2.73696274e-01, -9.26242024e-03,
        -5.73593352e-05,  1.45971522e-01,  1.55290663e-01,  5.52690923e-02,
        -1.41683370e-01,  1.56746414e-02,  3.04484181e-02, -1.89360473e-02,
        -5.05663417e-02,  1.71247214e-01,  2.95282546e-02,  7.68444613e-02,
        -7.57885575e-02,  2.74948359e-01, -3.95549506e-01, -1.06983632e-02,
        -4.2

In [39]:
((vectors[0]+vectors[1])/2) == result

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [40]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class W2VCrewFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, w2v, fields, min_cnt_movies=2):
        self.fields = fields
        self.min_cnt_movies = min_cnt_movies
        self.w2v = w2v

    def fit(self, X, y):
        self.default_vector_ = np.mean(w2v.wv.vectors, axis=0) # no depende de x y de y 
        return self
    
    def _get_movie_vector(self, x_i): #x_i=movie
        vectors = []
        for field in self.fields:
            person_id = x_i[field]
            if person_id not in self.w2v.wv or self.w2v.wv.vocab[person_id].count < self.min_cnt_movies: continue

            vectors.append(self.w2v.wv[person_id])

        if len(vectors) == 0:
            return self.default_vector_
        else:
            return np.mean(vectors, axis=0)
            
    def transform(self, X):
        return np.asarray([self._get_movie_vector(x_i) for x_i in X])

In [41]:
pipe = make_pipeline(
    W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
    StandardScaler(),
    LogisticRegression()
)

In [42]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7031829968700183, 0.6403970210810784)

**antes estabamos en 0.72 y 0.57 con el modelo lineal. El w2vec generaliza mucho mejor : 0.64 en development.** 

In [43]:
pipe = make_pipeline( # actor mas director dan algo de 100
    W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star', 'director']),
    StandardScaler(),
    LogisticRegression()
)

In [44]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7238369663327426, 0.664699201818925)

**agregamos el director y subio de 0.64 a 0.66 y tmb subio un poco en training.**

In [45]:
# aca con los actores por un lado con un vector de 100
# los directores con un vector de 100
#sumados dan algo de 200
pipe = make_pipeline(
    make_union(
        W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
        W2VCrewFeatures(w2v, ['director'])
    ),
    StandardScaler(),
    LogisticRegression()
)

In [46]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7314872415740084, 0.6598542577550959)

**dio lo mismo en test, pero overfitteo un poco mas, tiene mas complejidad este modelo.** 

In [None]:
pipe = make_pipeline(
    make_union(
        W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
    ),
    GradientBoostingClassifier()
)

In [None]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc