# Parte 1

## Creación de la Base de Datos
___

In [1]:
import psycopg2
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

# PostgreSQL connection settings are read from environment variables.
# Do not print these values: cell outputs are stored with the notebook.
user, password, host, dbname = (
    os.getenv(env_key)
    for env_key in (
        'POSTGRES_USER',
        'POSTGRES_PASSWORD',
        'POSTGRES_DBHOST',
        'POSTGRES_DBNAME',
    )
)

In [2]:
# Connect without a dbname so the database itself can be dropped and recreated.
conn = psycopg2.connect(user=user, host=host, port=5432, password=password)
# CREATE/DROP DATABASE cannot run inside a transaction block, hence autocommit.
conn.set_session(autocommit=True)

# Get a cursor
cursor = conn.cursor()
name_Database = "prueba"
# NOTE(review): the table-creation cell connects with `dbname` from the
# environment, not with `name_Database`; confirm POSTGRES_DBNAME is "prueba".

# Drop the database if it already exists (same name that is created below)
cursor.execute(f"DROP DATABASE IF EXISTS {name_Database};")

# Build the CREATE DATABASE statement
sqlCreateDatabase = f"create database {name_Database};"

# Create the database in PostgreSQL
cursor.execute(sqlCreateDatabase)

In [3]:
# Release the cursor first, then the connection used to create the database.
for handle in (cursor, conn):
    handle.close()

## Obtención y tratamiento de las columnas del CSV
___

In [4]:
# Load the training CSV and normalise its headers into SQL-friendly names:
# the 'hispanic / latin' header gets an explicit name, remaining spaces become '_'.
df = pd.read_csv('train_cupid.csv').rename(columns={'hispanic / latin': 'hispanic_latin'})
df.columns = df.columns.str.replace(" ", "_")

# Every column is declared as `numeric`; the result is a comma-separated
# column-definition list for a CREATE TABLE statement.
cols = ', '.join(column_name + ' numeric' for column_name in df.columns.to_list())

## Creación de la tabla
___

In [5]:
conn = psycopg2.connect(dbname=dbname, user=user, host=host, port=5432, password=password)
cursor = conn.cursor()

table_names = ('train_cupid', 'test_cupid')

# Drop train_cupid and test_cupid if they already exist.
for table_name in table_names:
    cursor.execute(f"DROP TABLE IF EXISTS {table_name};")

# Create both tables with the same column definitions held in `cols`.
for table_name in table_names:
    cursor.execute(f"create table {table_name} ({cols});")

conn.commit()


## Importación de los datos del CSV a las Tablas creadas

In [6]:
# Bulk-load each CSV into the table that shares its base name.
for table_name in ('train_cupid', 'test_cupid'):
    with open(f'{table_name}.csv', 'r') as csv_file:
        next(csv_file)  # Skip the CSV header row.
        cursor.copy_from(csv_file, table_name, sep=',')

conn.commit()

## Añadiendo columna indice a las tablas train_cupid y test_cupid

In [7]:
# Add an auto-incrementing `indice` primary key to both tables.
for table_name in ('train_cupid', 'test_cupid'):
    cursor.execute(f'ALTER TABLE {table_name} ADD indice SERIAL PRIMARY KEY;')
conn.commit()

# Parte 2

## Importando datos de train_cupid con Pandas

In [8]:
# Read the whole training table, using the `indice` column as the DataFrame index.
df = pd.read_sql(sql='SELECT * FROM train_cupid', con=conn, index_col='indice')
df

  df = pd.read_sql('SELECT * FROM train_cupid', conn, index_col='indice')


Unnamed: 0_level_0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,35.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,23.0,71.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,29.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,29.0,67.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20077,33.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20078,22.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
20079,28.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20080,31.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Print the per-column dtype and non-null count summary of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20081 entries, 1 to 20081
Data columns (total 98 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             20081 non-null  float64
 1   height                          20081 non-null  float64
 2   virgo                           20081 non-null  float64
 3   taurus                          20081 non-null  float64
 4   scorpio                         20081 non-null  float64
 5   pisces                          20081 non-null  float64
 6   libra                           20081 non-null  float64
 7   leo                             20081 non-null  float64
 8   gemini                          20081 non-null  float64
 9   aries                           20081 non-null  float64
 10  aquarius                        20081 non-null  float64
 11  cancer                          20081 non-null  float64
 12  sagittarius                     

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from feature_engine.imputation import  MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import helpers

from sklearn import set_config
# Display scikit-learn estimators as diagrams in notebook output.
set_config(display='diagram')

## Vectores objetivo: **single**, **seeing_someone** y **available**

In [11]:
# One train/validation split per target column, keyed by target name.
# NOTE(review): only the current target is dropped from X, so the other two
# target columns remain among the features — confirm this is intended.
targets = ['single', 'seeing_someone', 'available']
data = {}
for target in targets:
    X, y = df.drop(columns=[target]), df[target]
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=.2, random_state=42
    )
    data[target] = dict(
        X=X,
        y=y,
        X_train=X_train,
        X_valid=X_valid,
        y_train=y_train,
        y_valid=y_valid,
    )

In [12]:
# Candidate classifiers, keyed m1..m7; seeded with 42 wherever a random_state is passed.
modelos = dict(
    m1=GradientBoostingClassifier(random_state=42),
    m2=AdaBoostClassifier(
        base_estimator=RandomForestClassifier(max_depth=1, n_estimators=5),
        random_state=42,
        n_estimators=100,
        learning_rate=1,
    ),
    m3=RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    m4=SVC(random_state=42, probability=True),
    m5=DecisionTreeClassifier(random_state=42),
    m6=LogisticRegression(random_state=42, C=0.01),
    m7=BernoulliNB(),
)

## Preprocesamiento

In [13]:
# Preprocessing steps, applied in order: mean imputation, ordinal encoding of
# `age`, then standard scaling of `age` and `height`.
mean_imputer = MeanMedianImputer(imputation_method='mean')
age_encoder = OrdinalEncoder(encoding_method='ordered', variables='age', ignore_format=True)
scaler = SklearnTransformerWrapper(StandardScaler(), variables=['age', 'height'])

prep = Pipeline(steps=[
    ('num_imp', mean_imputer),
    ('ord', age_encoder),
    ('sc', scaler),
])

## Fit y serialización de los modelos

In [14]:
# Build and report one pipeline per (target, classifier) pair.
# NOTE(review): the same `prep` object is passed to every pipeline; whether
# helpers.pipeline_maker copies it is not visible from this notebook.
for target in targets:
    print(target)
    split = data[target]
    for modelo in modelos.values():
        pipeline = helpers.pipeline_maker(prep=prep, classifier=modelo)
        # The bare class name (e.g. "SVC") is passed as the model's label.
        model_label = type(modelo).__name__
        helpers.report_performance(
            pipeline,
            model_label,
            target,
            split['X_train'],
            split['X_valid'],
            split['y_train'],
            split['y_valid']
        )


single
GradientBoostingClassifier
El modelo no se entrenó porque ya existía el archivo ./models/single__GradientBoostingClassifier__1211-21.pkl
AdaBoostClassifier
El modelo no se entrenó porque ya existía el archivo ./models/single__AdaBoostClassifier__1211-21.pkl
RandomForestClassifier
El modelo no se entrenó porque ya existía el archivo ./models/single__RandomForestClassifier__1211-21.pkl
SVC
El modelo no se entrenó porque ya existía el archivo ./models/single__SVC__1211-21.pkl
DecisionTreeClassifier
El modelo no se entrenó porque ya existía el archivo ./models/single__DecisionTreeClassifier__1211-21.pkl
LogisticRegression
El modelo no se entrenó porque ya existía el archivo ./models/single__LogisticRegression__1211-21.pkl
BernoulliNB
El modelo no se entrenó porque ya existía el archivo ./models/single__BernoulliNB__1211-21.pkl
seeing_someone
GradientBoostingClassifier
El modelo no se entrenó porque ya existía el archivo ./models/seeing_someone__GradientBoostingClassifier__1211-21.pk

# Parte 3

## Importando datos de test_cupid con Pandas

In [15]:
# Read the whole test table, using the `indice` column as the DataFrame index.
df_test = pd.read_sql(sql='SELECT * FROM test_cupid', con=conn, index_col='indice')
df_test

  df_test = pd.read_sql('SELECT * FROM test_cupid', conn, index_col='indice')


Unnamed: 0_level_0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,32.0,65.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,24.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,29.0,62.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5,39.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19939,48.0,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19940,52.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19941,59.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19942,24.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
import psycopg2.extras as extras
def execute_values(conn, df, table):
    """Insert every row of ``df`` into ``table`` with one batched INSERT.

    The statement targets the columns named by ``df.columns``, in that order,
    and is printed before execution. The transaction is committed on success
    and rolled back on failure; the cursor is closed in both cases.

    Args:
        conn: Open psycopg2 connection.
        df: DataFrame whose column names match columns of ``table``.
        table: Name of the destination table. It is interpolated into the SQL
            text as-is, so it must come from trusted code.

    Returns:
        None on success, 1 if the insert raised (the error is printed).
    """
    # itertuples yields native Python scalars instead of NumPy scalars;
    # psycopg2 has no default adapter for NumPy integer types.
    tuples = list(df.itertuples(index=False, name=None))

    cols = ','.join(df.columns)
    # `%%s` leaves a single `%s` placeholder for extras.execute_values to expand.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    print(f'\t\t{query}')
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        # Deliberate best-effort: report, undo the partial work, signal failure.
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        cursor.close()
    print("\t\tthe dataframe is inserted")


In [17]:
import glob
import datetime

# Each entry lists the columns handed to helpers.create_crosstab for one query.
queries = {
    'Query 1' : ['atheism', 'asian', 'employed', 'pro_dogs', 'chinese'],
    'Query 2' : ['income_over_75', 'french', 'german', 'orientation_straight', 'new_york'],
    'Query 3' : ['education_undergrad_university', 'body_type_regular', 'pro_dogs', 'employed'],
    'Query 4' : ['taurus', 'indian', 'washington', 'income_between_50_75', 'hinduism']
}

# Model files carry a day-month-hour stamp, e.g. 1211-19 = 12 November at 19h.
# Computed once so every target looks for the same stamp even if the hour
# changes while the loop is running.
# NOTE(review): only models stamped with the *current* hour are matched, so
# this cell finds nothing if the models were serialized in a different hour.
fecha = datetime.datetime.now().strftime('%d%m-%H')

cursor_test = conn.cursor()
try:
    for target in targets:
        print(target)
        X_test = df_test.drop(columns=[target])
        y_test = df_test[target]

        archivos_de_modelos = glob.glob(f'./models/{target}*_{fecha}.pkl')
        if not archivos_de_modelos:
            print(f'\tNo se encontraron modelos para {target} con fecha {fecha}')

        for key, query in queries.items():
            print(f'\t{key}({query})')
            for archivo_modelo in archivos_de_modelos:
                print(f'\t\tModelo: {archivo_modelo}')
                crosstab, _, model_name = helpers.create_crosstab(archivo_modelo, X_test, y_test, query)
                nombre_tabla = f"{target}_{model_name.lower()}_{key.lower().replace(' ','')}"
                # Named `column_defs` so the `cols` string built for the
                # train/test CREATE TABLE statements is not overwritten.
                column_defs = ', '.join(f'{name} numeric' for name in crosstab.index.names)
                cursor_test.execute(f"DROP TABLE IF EXISTS {nombre_tabla};")
                cursor_test.execute(f"CREATE TABLE {nombre_tabla} ({column_defs}, {target}_yhat numeric);")
                conn.commit()
                execute_values(conn, crosstab.reset_index(), nombre_tabla)
finally:
    cursor_test.close()


single
	Query 1(['atheism', 'asian', 'employed', 'pro_dogs', 'chinese'])
		Modelo: ./models\single__AdaBoostClassifier__1211-21.pkl
		INSERT INTO single_adaboostclassifier_query1(atheism,asian,employed,pro_dogs,chinese,single_yhat) VALUES %s
the dataframe is inserted
		Modelo: ./models\single__BernoulliNB__1211-21.pkl
		INSERT INTO single_bernoullinb_query1(atheism,asian,employed,pro_dogs,chinese,single_yhat) VALUES %s
the dataframe is inserted
		Modelo: ./models\single__DecisionTreeClassifier__1211-21.pkl
		INSERT INTO single_decisiontreeclassifier_query1(atheism,asian,employed,pro_dogs,chinese,single_yhat) VALUES %s
the dataframe is inserted
		Modelo: ./models\single__GradientBoostingClassifier__1211-21.pkl
		INSERT INTO single_gradientboostingclassifier_query1(atheism,asian,employed,pro_dogs,chinese,single_yhat) VALUES %s
the dataframe is inserted
		Modelo: ./models\single__LogisticRegression__1211-21.pkl
		INSERT INTO single_logisticregression_query1(atheism,asian,employed,pro_dogs