# Parte 1

## Creación de la Base de Datos
___

In [1]:
import psycopg2
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_DBHOST')
dbname = os.getenv('POSTGRES_DBNAME')
# print(f'user_{user}_password_{password}_host_{host}_dbname_{dbname}')

In [2]:
conn = psycopg2.connect(user=user, host=host, port=5432, password=password)
conn.set_session(autocommit=True)

# Obtención de Cursor
cursor          = conn.cursor();
name_Database   = "prueba";

# Eliminación de base de datos en caso de existir
cursor.execute("DROP DATABASE IF EXISTS prueba;")

# Creación de sentencia para la base de datos
sqlCreateDatabase = "create database "+ name_Database +";"

# Creacion de la base de datos en PostgreSQL
cursor.execute(sqlCreateDatabase);

In [3]:
cursor.close()
conn.close()

## Obtención y tratamiento de las columnas del CSV
___

In [4]:
df = pd.read_csv('train_cupid.csv')
df.rename(columns = {'hispanic / latin':'hispanic_latin'}, inplace = True)
df.columns = df.columns.str.replace(" ", "_")

cols = df.columns.to_list()
for index, col in enumerate(cols):
    cols[index] +=' numeric'
    
cols = ', '.join(cols)

## Creación de la tabla
___

In [5]:
conn = psycopg2.connect(dbname=dbname, user=user, host=host, port=5432, password=password)

cursor = conn.cursor()

#Eliminando las tablas train_cupid y test_cupid si existen.
cursor.execute("DROP TABLE IF EXISTS train_cupid;")
cursor.execute("DROP TABLE IF EXISTS test_cupid;")

# Obteniendo un Cursor para las tablas
name_table_train = f"train_cupid ({cols})"
name_table_test = f"test_cupid ({cols})"

# Creación de las sentecias para las tablas
sqlTable_train = "create table "+name_table_train+';'
sqlTable_test = "create table "+name_table_test+';'

# Creando las tablas en PostgreSQL
cursor.execute(sqlTable_train)
cursor.execute(sqlTable_test)
conn.commit()


## Importación de los datos del CSV a las Tablas creadas

In [6]:
with open('train_cupid.csv', 'r') as f:    
    next(f) # Saltar la fila de los encabezados del CSV.
    cursor.copy_from(f, 'train_cupid', sep=',')

with open('test_cupid.csv', 'r') as f:    
    next(f) # Saltar la fila de los encabezados del CSV.
    cursor.copy_from(f, 'test_cupid', sep=',')

conn.commit()

## Añadiendo columna indice a las tablas train_cupid y test_cupid

In [7]:
cursor.execute('ALTER TABLE train_cupid ADD indice SERIAL PRIMARY KEY;')
cursor.execute('ALTER TABLE test_cupid ADD indice SERIAL PRIMARY KEY;')
conn.commit()

# Parte 2

## Importando datos de train_cupid con Pandas

In [8]:
df = pd.read_sql('SELECT * FROM train_cupid', conn, index_col='indice')
df

  df = pd.read_sql('SELECT * FROM train_cupid', conn, index_col='indice')


Unnamed: 0_level_0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,35.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,23.0,71.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,29.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,29.0,67.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20077,33.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20078,22.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
20079,28.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20080,31.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20081 entries, 1 to 20081
Data columns (total 98 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             20081 non-null  float64
 1   height                          20081 non-null  float64
 2   virgo                           20081 non-null  float64
 3   taurus                          20081 non-null  float64
 4   scorpio                         20081 non-null  float64
 5   pisces                          20081 non-null  float64
 6   libra                           20081 non-null  float64
 7   leo                             20081 non-null  float64
 8   gemini                          20081 non-null  float64
 9   aries                           20081 non-null  float64
 10  aquarius                        20081 non-null  float64
 11  cancer                          20081 non-null  float64
 12  sagittarius                     

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from feature_engine.imputation import  MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import helpers


from sklearn import set_config
set_config(display='diagram')

## Vector objetivo: **single**

In [11]:
X = df.drop(columns=['single'])
y = df.single
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42)

In [12]:
modelos = {
    'm1' : GradientBoostingClassifier(random_state=42),
    'm2' : AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=1, n_estimators=5), random_state=42, n_estimators=100, learning_rate=1),
    'm3' : RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    'm4' : SVC(random_state=42, probability=True),
    'm5' : DecisionTreeClassifier(random_state=42),
    'm6' : LogisticRegression(random_state=42, C=0.01),
    'm7' : BernoulliNB()
}

## Preprocesamiento

In [13]:
prep = Pipeline(steps=[
    ('num_imp', MeanMedianImputer(imputation_method='mean')),
    ('ord', OrdinalEncoder(encoding_method='ordered', variables='age', ignore_format=True)),
    ('sc', SklearnTransformerWrapper(StandardScaler(), variables=['age', 'height']))
])

prep.fit_transform(X_train, y_train)

Unnamed: 0_level_0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
938,-0.801779,-0.290518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
397,0.679797,0.434828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
13133,-0.366021,-0.290518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
16480,0.156888,-1.015863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12838,-1.150385,1.401956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11285,-1.063233,0.434828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11965,-1.150385,0.193046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5391,-0.801779,0.676610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
861,-0.801779,1.643738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Fit y serialización de los modelos

In [14]:
for modelo in modelos.values():
    helpers.report_performance(modelo, X_train, X_valid, y_train, y_valid)


GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.99      0.53      0.69       299
         1.0       0.96      1.00      0.98      3718

    accuracy                           0.96      4017
   macro avg       0.98      0.77      0.84      4017
weighted avg       0.97      0.96      0.96      4017

AdaBoostClassifier
              precision    recall  f1-score   support

         0.0       0.99      0.52      0.68       299
         1.0       0.96      1.00      0.98      3718

    accuracy                           0.96      4017
   macro avg       0.98      0.76      0.83      4017
weighted avg       0.97      0.96      0.96      4017

RandomForestClassifier
              precision    recall  f1-score   support

         0.0       1.00      0.51      0.68       299
         1.0       0.96      1.00      0.98      3718

    accuracy                           0.96      4017
   macro avg       0.98      0.76      0.83      4017
weigh

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       299
         1.0       0.93      1.00      0.96      3718

    accuracy                           0.93      4017
   macro avg       0.46      0.50      0.48      4017
weighted avg       0.86      0.93      0.89      4017

DecisionTreeClassifier
              precision    recall  f1-score   support

         0.0       0.49      0.59      0.54       299
         1.0       0.97      0.95      0.96      3718

    accuracy                           0.92      4017
   macro avg       0.73      0.77      0.75      4017
weighted avg       0.93      0.92      0.93      4017

LogisticRegression
              precision    recall  f1-score   support

         0.0       1.00      0.51      0.68       299
         1.0       0.96      1.00      0.98      3718

    accuracy                           0.96      4017
   macro avg       0.98      0.76      0.83      4017
weighted avg       0.97      0.9

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Parte 3

## Importando datos de test_cupid con Pandas

In [15]:
df_test = pd.read_sql('SELECT * FROM test_cupid', conn, index_col='indice')
df_test

  df_test = pd.read_sql('SELECT * FROM test_cupid', conn, index_col='indice')


Unnamed: 0_level_0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,32.0,65.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,24.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,29.0,62.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5,39.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19939,48.0,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19940,52.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19941,59.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19942,24.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
queries = {
    'Query 1' : ['atheism', 'asian', 'employed', 'pro_dogs', 'chinese'],
    'Query 2' : ['income_over_75', 'french', 'german', 'orientation_straight', 'new york'],
    'Query 3' : ['education_undergrad_university', 'body_type_regular', 'pro_dogs', 'employed'],
    'Query 4' : ['taurus', 'indian', 'washington', 'income_between_50_75', 'hinduism']
}

X_test = df.drop(columns=['single'])
y_test = df.single

import glob
fecha = '1211-19' # 1211-19 = 12 de Noviembre a las 19 horas.
archivos_de_modelos = glob.glob(f'*_{fecha}.pkl')

queries_result = {}
for archivo_modelo in archivos_de_modelos:
    for key, query in queries.items():
        queries_result[key] = helpers.create_crosstab(archivo_modelo, X_test, y_test, query)

queries_result


KeyError: 'new york'