# Clasificación - modelos supervisados a partir de la matriz F2V

Este Notebook entrena modelos de clasificación para predecir si una película es de inmigración o no a partir de la matriz $F2V$, y toma las probabilidades predichas por el mejor modelo como índice de contenido de inmigración de cada película.

1. [Calcular F2V](#f2v)
2. [Modelos de clasificación](#clasif)
3. [Probabilidades predichas como índice de inmigración](#proba)


In [None]:
# Importamos librerías 

## Módulos generales
from libraries import *

## Módulos con funciones creadas para este trabajo
## (requieren de haber importado previamente los módulos generales)
from limpieza_subt import *
from clustering import *

In [None]:
# Fill in both directories before running the notebook.
#   datawd: folder every data file below is read from / written to
#   gitwd:  not referenced by the cells shown here (presumably the repo checkout)
gitwd, datawd = "", ""

## Importar datos

In [None]:
# Load every input file from datawd.
# NOTE(review): unpickling runs arbitrary code; only load pickles you
# produced yourself.

# Tables stored with pandas
l2v = pd.read_pickle(f"{datawd}/l2v.pkl")
filmids = pd.read_pickle(f"{datawd}/filmids.pkl")
master = pd.read_csv(f"{datawd}/titles_master.csv")

# Objects stored with the pickle module
with open(f"{datawd}/tfidf.pkl", "rb") as inputfile:
    tfidf = pickle.load(inputfile)

with open(f"{datawd}/stoi.pkl", "rb") as inputfile:
    stoi = pickle.load(inputfile)

In [None]:
# Light clean-up of the loaded inputs.

# Order the L2V rows by their "stoi" value and renumber the index from 0
l2v = l2v.sort_values(by="stoi")
l2v = l2v.reset_index(drop=True)

# Convert the TF-IDF object through its toarray() method
tfidf = tfidf.toarray()

# Left-join the master table onto the (tconst, filmid) pairs so every
# film id row is kept
filmids = pd.merge(filmids[["tconst", "filmid"]], master, how="left", on="tconst")

<a id='f2v'></a>

## F2V: matriz de variables explicativas
La matriz de features F2V (película-a-vector) se calcula como TFIDF x L2V

In [None]:
# Build F2V (film-to-vector) as the matrix product TFIDF x L2V.

# Drop the first two TF-IDF columns (PAD and UNK).
# NOTE(review): tfidf is overwritten in place, so re-running this cell
# trims two more columns each time.
tfidf = tfidf[:, 2:]
print(tfidf.shape)

# L2V without its first two columns and its last one (not dimensions)
print(l2v.iloc[:, 2:-1].shape)

# films to vec! Columns are labelled dim_0, dim_1, ...
f2v = pd.DataFrame(tfidf.dot(l2v.iloc[:, 2:-1])).add_prefix("dim_")

# Put the ids and the other film data (response included) side by side
# with the dimensions
f2v = pd.concat([filmids, f2v], axis=1)

Quitamos las películas extra-inmigración para este análisis

In [None]:
# Keep only the films flagged main == 1 or before2000 == 1 and renumber rows
keep_rows = (f2v.main == 1) | (f2v.before2000 == 1)
f2v = f2v[keep_rows].reset_index(drop=True)
# Printed explicitly: a bare expression in the middle of a cell is never
# displayed by the notebook
print(f2v.shape)

# Save the full table, plus the response variable on its own
f2v.to_pickle(datawd + "/tfidf2vec.pkl")
f2v[["tconst", "just_migra"]].to_csv(datawd + "/target.csv", index=False)

<a id='clasif'></a>

## Modelos de clasificación

In [None]:
# Reload the F2V table from the pickle written by the F2V section
f2v = pd.read_pickle(f"{datawd}/tfidf2vec.pkl")

###  X e y

In [None]:
# Features: every column whose name contains "dim_"
dims = [col for col in f2v.columns if "dim_" in col]
X = f2v.loc[:, dims]

# Response: the just_migra column (whether a film is an immigration film)
y = f2v["just_migra"]
print(y.mean())  # mean of the response: heavily imbalanced!

### Multicolinealidad

In [None]:
# Screen the F2V dimensions for multicollinearity.

# Absolute pairwise Pearson correlation between dimensions
multic = abs(X.corr(method='pearson'))

# Per dimension, count the entries with 0.5 < |r| < 1; the upper bound
# leaves out correlations of exactly 1, such as the diagonal
result_df2 = (multic > 0.5) & (multic < 1)
result_df2 = np.sum(result_df2, axis=0)

# 95th percentile of those counts, computed once and reused below
p95 = np.percentile(result_df2, 95)

print(np.sum(result_df2 > 0))    # dimensions with at least one such partner
print(np.sum(result_df2 > p95))  # dimensions above the threshold (dropped)

# Drop the dimensions with high multicollinearity.
# Keep everything at or below the threshold (<=) so the kept set is the
# exact complement of the count printed above. A strict "<" would also
# drop every dimension tied with the percentile, and would keep nothing
# at all when the percentile is 0.
okvars = result_df2.index[result_df2 <= p95]
print(len(okvars))
X_corrected = X[okvars]

### Split train - test y SMOTE para sobremuestrear

In [None]:
## Train-test split.
# stratify=y keeps the share of the minority class equal in both splits;
# with a heavily imbalanced response a plain random split can leave the
# test set with a noticeably different positive rate.
X_train, X_test, y_train, y_test = train_test_split(
    X_corrected, y, test_size=0.25, random_state=42, stratify=y)

# Oversample with SMOTE on the training split only; the test split is
# left untouched
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Mean of the response before and after resampling
print(np.mean(y_train),
      np.mean(y_res))

## Dictionary that will hold the tuned hyper-parameters of every model
hyperparameters = {}

#### Comparar con y sin SMOTE para logit-lasso

In [None]:
# Logistic regression tuned and fitted twice, once on the SMOTE-resampled
# training set and once on the original one, then scored on the same
# untouched test split.
lr = LogisticRegression()
parameters = {'penalty': ['l1', 'l2', 'none'],
              'solver': ['saga'],
              'C': [0.5, 1, 5],
              'max_iter': [20000]}

# With SMOTE
grid_obj = GridSearchCV(lr, parameters, scoring='roc_auc', cv=3)
grid_fit = grid_obj.fit(X_res, y_res)
lr_best_params = grid_obj.best_params_
lr = LogisticRegression(**lr_best_params)
lr = lr.fit(X_res, y_res)
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
y_res_pred_lr = lr.predict(X_res)
# ROC AUC needs a continuous score. Fed with hard 0/1 labels it collapses
# to a single operating point and merely reproduces balanced accuracy.
y_test_proba_lr = lr.predict_proba(X_test)[:, 1]

print(lr_best_params)
print(np.mean(y_test_pred_lr))  # mean of the hard test predictions

print(balanced_accuracy_score(y_test, y_test_pred_lr),
      roc_auc_score(y_test, y_test_proba_lr),
      accuracy_score(y_test, y_test_pred_lr))

# Without SMOTE (a fresh, unfitted estimator is handed to the search)
grid_obj2 = GridSearchCV(LogisticRegression(), parameters, scoring='roc_auc', cv=3)
grid_fit2 = grid_obj2.fit(X_train, y_train)
lr_best_params2 = grid_obj2.best_params_
lr = LogisticRegression(**lr_best_params2)
lr = lr.fit(X_train, y_train)
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
y_test_proba_lr = lr.predict_proba(X_test)[:, 1]

print(lr_best_params2)
print(np.mean(y_test_pred_lr))

print(balanced_accuracy_score(y_test, y_test_pred_lr),
      roc_auc_score(y_test, y_test_proba_lr),
      accuracy_score(y_test, y_test_pred_lr))

### Entrenar varios modelos con SMOTE

In [None]:
# Fit every candidate classifier on the SMOTE-resampled training set
# (X_res, y_res). For the tuned models: 3-fold grid search scored by
# 'roc_auc', best parameters stored in `hyperparameters`, then a new
# estimator is built with those parameters and fitted again.
# The time.time() checkpoints a..f are used to print the elapsed minutes
# of each model.

# Naive Bayes (no grid search)
start_time = time.time()
gnb = GaussianNB()
gnb = gnb.fit(X_res, y_res)
a = time.time()
print("Naive Bayes:" , (a - start_time)/ 60)

# Logistic regression: grid over l1 / l2 / no penalty (no 'C' in this grid)
lr = LogisticRegression()
parameters = {'penalty': ['l1', 'l2', 'none'],
              'solver': ['saga'],
              'max_iter': [20000]}
grid_obj = GridSearchCV(lr, parameters, scoring='roc_auc', cv=3)
grid_fit = grid_obj.fit(X_res,y_res)
lr_best_params = grid_obj.best_params_
hyperparameters['LogisticRegression'] = lr_best_params
lr = LogisticRegression(**lr_best_params)
lr = lr.fit(X_res, y_res)
b = time.time()
print("Logit:", (b-a)/ 60)

# Random Forest Classifier: grid over size, depth, split criterion and
# minimum impurity decrease
rfclass = RandomForestClassifier(random_state=42)
parameters = {'n_estimators': [100, 200],
              'max_depth' : [5,10],
              'criterion' :['gini', 'entropy'],
              'min_impurity_decrease': [0.001,0.1]}
grid_obj = GridSearchCV(rfclass, parameters, scoring='roc_auc', cv=3)
grid_fit = grid_obj.fit(X_res, y_res)
rfclass_best_params = grid_obj.best_params_
hyperparameters['RandomForest'] = rfclass_best_params
rfclass = RandomForestClassifier(**rfclass_best_params, random_state=42)
rfclass = rfclass.fit(X_res,y_res)
c = time.time()
print("Random Forest:", (c-b)/ 60)

# Linear Discriminant Analysis: grid over tol
ldaclass = LinearDiscriminantAnalysis()
parameters = {'tol' : [0.0001,0.0003]}
grid_obj = GridSearchCV(ldaclass, parameters, scoring='roc_auc',cv=3)
grid_fit = grid_obj.fit(X_res, y_res)
lda_best_params = grid_obj.best_params_
hyperparameters['LinearDiscriminantAnalysis'] = lda_best_params
ldaclass = LinearDiscriminantAnalysis(**lda_best_params)
ldaclass = ldaclass.fit(X_res, y_res)
d = time.time()
print("LDA:", (d-c)/ 60)


# Quadratic Discriminant Analysis
# a. Tune the hyper-parameters by cross-validation (grid over reg_param)
qdaclass = QuadraticDiscriminantAnalysis()
parameters = {'reg_param': [0.05, 0.15 ,0.3, 0.5]}
grid_obj = GridSearchCV(qdaclass, parameters, scoring='roc_auc',cv=3)
grid_fit = grid_obj.fit(X_res, y_res)
qda_best_params = grid_obj.best_params_
hyperparameters['QuadraticDiscriminantAnalysis'] = qda_best_params
# b. Fit the model with the best parameters
qdaclass = QuadraticDiscriminantAnalysis(**qda_best_params)
qdaclass = qdaclass.fit(X_res, y_res)
e = time.time()
print("QDA:", (e-d)/ 60)

# K Nearest Neighbors
# a. Tune the hyper-parameters by cross-validation
#    (grid over n_neighbors, weights and metric)
knnclass = KNeighborsClassifier()
parameters = {'n_neighbors':[1,5,9,21],
              'weights': ['uniform','distance'],
              'metric': ['euclidean','cosine']}
grid_obj = GridSearchCV(knnclass, parameters, scoring='roc_auc', cv=3)
grid_fit = grid_obj.fit(X_res, y_res)
knnclass_best_params = grid_obj.best_params_
hyperparameters['KNeighbors'] = knnclass_best_params
# b. Fit the model with the best parameters
knnclass = KNeighborsClassifier(**knnclass_best_params)
knnclass = knnclass.fit(X_res,y_res)
f = time.time()
print("KNN", (f-e)/ 60)

### Predicciones y métricas

In [None]:
# Hard class predictions of every fitted model on the train and test
# splits WITHOUT SMOTE; the metric tables are computed from these.
y_train_pred_gbn, y_test_pred_gbn = (
    gnb.predict(part) for part in (X_train, X_test))

y_train_pred_lr, y_test_pred_lr = (
    lr.predict(part) for part in (X_train, X_test))

y_train_pred_rfclass, y_test_pred_rfclass = (
    rfclass.predict(part) for part in (X_train, X_test))

y_train_pred_ldaclass, y_test_pred_ldaclass = (
    ldaclass.predict(part) for part in (X_train, X_test))

y_train_pred_qdaclass, y_test_pred_qdaclass = (
    qdaclass.predict(part) for part in (X_train, X_test))

y_train_pred_knnclass, y_test_pred_knnclass = (
    knnclass.predict(part) for part in (X_train, X_test))

In [None]:
# Build the train / test metric tables and save them, together with the
# tuned hyper-parameters, under datawd.

def _metrics_table(fitted, X_eval, y_true, preds):
    """Return one row per model with ROC AUC, balanced accuracy and accuracy.

    fitted : dict {algorithm name: fitted classifier}; its order is the row order
    X_eval : features used to obtain the probability scores for ROC AUC
    y_true : true labels matching X_eval
    preds  : dict {algorithm name: hard class predictions on X_eval}

    ROC AUC is computed from predict_proba (column 1) rather than from the
    hard predictions: on 0/1 labels it would collapse to a single operating
    point and duplicate the balanced-accuracy column.
    """
    names = list(fitted)
    return pd.DataFrame({
        'Algorithms': names,
        'ROC-scores': [roc_auc_score(y_true, fitted[name].predict_proba(X_eval)[:, 1])
                       for name in names],
        'Balanced_acc-scores': [balanced_accuracy_score(y_true, preds[name])
                                for name in names],
        'Acc-scores': [accuracy_score(y_true, preds[name]) for name in names],
    })


# Row order of both tables
_fitted_models = {'Gaussian Naive Bayes': gnb,
                  'Linear Discriminant Analysis': ldaclass,
                  'Quadratic Discriminant Analysis': qdaclass,
                  'Logistic Regression': lr,
                  'Random Forest Classifier': rfclass,
                  'K Nearest Neighbors': knnclass}

_test_preds = {'Gaussian Naive Bayes': y_test_pred_gbn,
               'Linear Discriminant Analysis': y_test_pred_ldaclass,
               'Quadratic Discriminant Analysis': y_test_pred_qdaclass,
               'Logistic Regression': y_test_pred_lr,
               'Random Forest Classifier': y_test_pred_rfclass,
               'K Nearest Neighbors': y_test_pred_knnclass}

_train_preds = {'Gaussian Naive Bayes': y_train_pred_gbn,
                'Linear Discriminant Analysis': y_train_pred_ldaclass,
                'Quadratic Discriminant Analysis': y_train_pred_qdaclass,
                'Logistic Regression': y_train_pred_lr,
                'Random Forest Classifier': y_train_pred_rfclass,
                'K Nearest Neighbors': y_train_pred_knnclass}

test_metrics = _metrics_table(_fitted_models, X_test, y_test, _test_preds)
train_metrics = _metrics_table(_fitted_models, X_train, y_train, _train_preds)

train_metrics.to_pickle(datawd + "/clasif_train_metrics.pkl")
test_metrics.to_pickle(datawd + "/clasif_test_metrics.pkl")
with open(datawd + "/clasif_hyperparameters.pkl", 'wb') as outputfile:
    pickle.dump(hyperparameters, outputfile)


### Visualizar métricas

In [None]:
# Read back the two metric tables saved as pickles under datawd
train_metrics = pd.read_pickle(f"{datawd}/clasif_train_metrics.pkl")
test_metrics = pd.read_pickle(f"{datawd}/clasif_test_metrics.pkl")

In [None]:
def _metrics_heatmap(table, title, png_name):
    """Draw, save under datawd and show an annotated heatmap of one metric table.

    table    : DataFrame with the columns 'Algorithms', 'ROC-scores',
               'Acc-scores' and 'Balanced_acc-scores'
    title    : figure title
    png_name : file name (with leading slash) appended to datawd
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # Draw on the Axes created above and keep that handle, instead of
    # rebinding `ax` to the return values of plt.xticks / plt.title / plt.show
    sns.heatmap(table[['ROC-scores', 'Acc-scores', 'Balanced_acc-scores']],
                yticklabels=table.Algorithms,
                annot=True,
                cmap=sns.cm.rocket_r,
                ax=ax)
    plt.xticks(rotation=0)
    ax.set_title(title, fontsize=14)
    fig.savefig(datawd + png_name, dpi=300, bbox_inches='tight')
    plt.show()


# train
_metrics_heatmap(train_metrics, 'Train metrics', "/metrics_clasif_TRAIN.png")

# test
_metrics_heatmap(test_metrics, 'Test metrics', "/metrics_clasif_TEST.png")


<a id='proba'></a>

## Probabilidades predichas como índice de inmigración

In [None]:
# Take the logistic regression, one of the best models, and predict the
# probabilities over the whole set of films.
# The tuned parameters are read from the pickle saved with the metrics and
# the model is refitted on the SMOTE-resampled training set.
with open(datawd + "/clasif_hyperparameters.pkl", 'rb') as inputfile:
    hyperparameters = pickle.load(inputfile)
lr = LogisticRegression(**hyperparameters['LogisticRegression'])
lr = lr.fit(X_res, y_res)

# Probability predictions used as the score (column 1 is kept below)
y_pred_proba = lr.predict_proba(X_corrected)

# .copy() so the new columns are written to an independent frame and not
# to a slice of f2v (avoids pandas' SettingWithCopyWarning)
immi_proba = f2v[["tconst", "just_migra"]].copy()
immi_proba["lr_pred_proba"] = y_pred_proba[:, 1]
immi_proba["lr_pred"] = lr.predict(X_corrected)

In [None]:
# Confusion matrix of the logistic regression on the test split,
# normalised over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator replaces it.
disp = plot_confusion_matrix(
    lr, X_test, y_test,
    cmap='binary',
    normalize='true',
)
disp.im_.colorbar.remove()
disp.ax_.set_title("Regresión Logística - matriz de confusión - TEST")

plt.savefig(f"{datawd}/conf_matrix_lr.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ROC curve on the test split, built from the predicted probabilities
fpr, tpr, thresholds = metrics.roc_curve(y_test, lr.predict_proba(X_test)[:, 1], drop_intermediate=True)
# Index of the operating point highlighted in red. Clamped to the last
# available point: the curve can have fewer than 101 points, in which
# case a fixed index of 100 raises IndexError.
i = min(100, len(thresholds) - 1)
roc_auc = metrics.auc(fpr, tpr)
# Named roc_display so IPython's built-in display() is not overwritten
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                      estimator_name='Regresión Logística')
roc_display.plot()
plt.plot([0, 1], [0, 1], color='black', linestyle='--')  # diagonal reference line
plt.scatter(fpr[i], tpr[i], label=f"Umbral = {round(thresholds[ i ], 3)}", color="red")
plt.title('Regresión logística - curva ROC - TEST')
plt.legend()
plt.savefig(datawd + "/curva_roc_lr.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# ROC curve on the test split, built from the binary predictions

fpr, tpr, thresholds = metrics.roc_curve(y_test, lr.predict(X_test), drop_intermediate=True)

roc_auc = metrics.auc(fpr, tpr)
# Named roc_display_binary so IPython's built-in display() is not overwritten
roc_display_binary = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                             estimator_name='Regresión Logística')
roc_display_binary.plot()
plt.plot([0, 1], [0, 1], color='black', linestyle='--')  # diagonal reference line

plt.title('Curva ROC en datos de Test - Regresión logística')
plt.legend()
plt.savefig(datawd + "/curva_roc_lr_b.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Mean predicted probability and mean predicted class (share predicted as
# immigration), grouped by the true label
(
    immi_proba
    .groupby("just_migra")[["lr_pred_proba", "lr_pred"]]
    .mean()
)

In [None]:
# Box plots of the predicted probability, one box per true label
plt.figure(figsize=(8, 6))
sns.boxplot(data=immi_proba, x='just_migra', y='lr_pred_proba', palette='viridis')
plt.axhline(y=0.5, linestyle='--', linewidth=2, color='red')  # reference line at 0.5
plt.title('Boxplots de probabilidad predicha según etiqueta real')
plt.xlabel('Etiqueta de inmigración según IMDb')
plt.ylabel('Probabilidad Regresión Logística')
plt.savefig(f"{datawd}/boxplot_lr.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Overlaid density histograms of the predicted probability, one per true label
plt.figure(figsize=(8, 6), dpi=80)
for label_value, legend_text in ((0, 'Películas No Inmigración'),
                                 (1, 'Películas Inmigración')):
    scores = immi_proba.loc[immi_proba.just_migra == label_value, "lr_pred_proba"]
    plt.hist(scores, bins=20, alpha=0.5, label=legend_text, density=True)
plt.legend(loc='upper right')

plt.savefig(f"{datawd}/hist_lr.png", dpi=300)
plt.show()