In [None]:
import psycopg2
import pandas as pd

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) into the process environment
# so that the os.getenv() lookups below can see them.
load_dotenv()

# Google Cloud settings.
# NOTE(review): this line previously read an undefined name (`v`) and raised
# NameError on a fresh kernel. The environment variable name is assumed by
# analogy with the settings below -- confirm it matches the .env file.
GCP_PROJECT_ID = os.getenv('GCP_PROJECT_ID')
SERVICE_ACCOUNT_FILE = os.getenv('SERVICE_ACCOUNT_FILE')
STORAGE_BUCKET_NAME = os.getenv('STORAGE_BUCKET_NAME')

# PostgreSQL connection settings (each is None when the variable is not set).
user = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PASSWORD')
host = os.getenv('POSTGRES_DBHOST')
dbname = os.getenv('POSTGRES_DBNAME')

# Creación de la Base de Datos
___

In [None]:
# Server-level connection (no dbname given). Autocommit is enabled because
# PostgreSQL refuses to run DROP/CREATE DATABASE inside a transaction block.
conn = psycopg2.connect(user=user, host=host, port=5432, password=password)
conn.set_session(autocommit=True)

# Cursor used for the two administrative statements below.
cursor = conn.cursor()

# Single source of truth for the database name used by BOTH statements below
# (the DROP statement used to hard-code the name separately).
# NOTE(review): the table-creation cell connects to `dbname` (POSTGRES_DBNAME);
# that value has to be "prueba" for the table to land in the database created here.
name_Database = "prueba"

# Drop any previous copy of the database so the notebook can be re-run from scratch.
cursor.execute(f"DROP DATABASE IF EXISTS {name_Database};")

# Build the CREATE DATABASE statement.
sqlCreateDatabase = f"create database {name_Database};"

# Create the database in PostgreSQL.
cursor.execute(sqlCreateDatabase)

In [None]:
# Release the connection that was used to create the database:
# the cursor is closed first, then the connection that owns it.
cursor.close()
conn.close()

# Obtención y tratamiento de las columnas del CSV
___

In [None]:
# Read the CSV and normalise its header so every column name can be used as an
# SQL identifier: the 'hispanic / latin' column gets an explicit name, and any
# remaining spaces become underscores.
df = pd.read_csv('train_cupid.csv').rename(columns={'hispanic / latin': 'hispanic_latin'})
df.columns = df.columns.str.replace(" ", "_")

# Comma-separated column definitions for CREATE TABLE; every column is declared
# as `numeric`.
cols = ', '.join(name + ' numeric' for name in df.columns.to_list())

# Creación de la tabla
___

In [None]:
# Open a connection to the database named by `dbname` and get a cursor on it.
conn = psycopg2.connect(host=host, port=5432, dbname=dbname, user=user, password=password)
cursor = conn.cursor()

# Remove the train_cupid table if it already exists.
cursor.execute("DROP TABLE IF EXISTS train_cupid;")

# Table name followed by its column definitions (the `cols` string built earlier).
name_table = f"train_cupid ({cols})"

# Full CREATE TABLE statement.
sqlTable = f"create table {name_table};"

# Create the table in PostgreSQL and commit the DROP/CREATE pair.
cursor.execute(sqlTable)
conn.commit()


# Importación de los datos del CSV a la Tabla creada

In [None]:
# Bulk-load the CSV with COPY in CSV mode. Unlike the plain-text COPY issued by
# copy_from(), CSV mode skips the header row itself (HEADER), honours quoted
# fields that contain commas, and loads empty fields as NULL instead of failing
# to parse '' as a numeric value.
with open('train_cupid.csv', 'r') as f:
    cursor.copy_expert(
        "COPY train_cupid FROM STDIN WITH (FORMAT csv, HEADER true)",
        f,
    )

# Persist the loaded rows.
conn.commit()

# Añadiendo columna indice a la tabla train_cupid

In [None]:
# Add an auto-incrementing `indice` column as the primary key.
# IF NOT EXISTS (PostgreSQL 9.6+) lets this cell be re-run without failing
# because the column is already there.
cursor.execute('ALTER TABLE train_cupid ADD COLUMN IF NOT EXISTS indice SERIAL PRIMARY KEY;')
conn.commit()

# Importando datos de train_cupid con Pandas

In [None]:
# Read the whole train_cupid table back into pandas, using the serial
# `indice` column as the DataFrame index, and display the result.
select_all = 'SELECT * FROM train_cupid'
df = pd.read_sql(select_all, conn, index_col='indice')
df

In [None]:
# Print a summary of the loaded DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from feature_engine.imputation import  MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn import set_config
# Ask scikit-learn to show estimators as diagrams when they are displayed.
set_config(display='diagram')

# Vector objetivo: **single**

In [None]:
# Separate the target column `single` from the features, then hold out 20% of
# the rows as a test set (fixed seed so the split is repeatable).
y = df['single']
X = df.drop(columns='single')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Classifier instances; m1 is the one plugged into the grid-search pipeline
# further down.
m1 = GradientBoostingClassifier(random_state=42)

# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old keyword
# was removed in 1.4, whereas the first positional argument works on both.
m2 = AdaBoostClassifier(
    RandomForestClassifier(max_depth=1, n_estimators=5),
    random_state=42,
    n_estimators=100,
    learning_rate=1,
)
m3 = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
m4 = SVC(random_state=42, probability=True)
m5 = DecisionTreeClassifier(random_state=42)
m6 = LogisticRegression(random_state=42, C=0.01)
m7 = BernoulliNB()

# Preprocesamiento

In [None]:
# Individual preprocessing steps, in the order they are applied.
imputer_step = MeanMedianImputer(imputation_method='mean')
age_encoder_step = OrdinalEncoder(encoding_method='ordered', variables='age', ignore_format=True)
scaler_step = SklearnTransformerWrapper(StandardScaler(), variables=['age', 'height'])

# Preprocessing pipeline: impute -> ordinal-encode `age` -> standardise `age` and `height`.
prep = Pipeline(steps=[
    ('num_imp', imputer_step),
    ('ord', age_encoder_step),
    ('sc', scaler_step),
])

# Fit on the training split and display the transformed output as a sanity check.
prep.fit_transform(X_train, y_train)

In [None]:
# Hyper-parameter grid for the `prep` + `model` pipeline. Keys follow
# scikit-learn's `<step>__<parameter>` naming for nested estimators.
params = dict(
    prep__num_imp__imputation_method=['median', 'mean'],
    # Not searched for now: the ordinal encoder's encoding method
    # ('ordered' vs 'arbitrary').
    model__n_estimators=[10, 50, 100, 200],
    model__learning_rate=[0.01, 0.1, 0.5, 1],
    model__max_depth=[1, 2, 3, 4, 5],
)



In [None]:
# Full estimator: the preprocessing pipeline followed by model m1.
pipe_m1 = Pipeline([('prep', prep), ('model', m1)])

# Grid search over `params`, scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 lets it run the fits in parallel.
search = GridSearchCV(
    estimator=pipe_m1,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
search

In [None]:
# Run the grid search on the training split.
search.fit(X_train, y_train)
# Report the best cross-validated score and the parameter combination behind it.
print(f'Best Score {search.best_score_}')
print(f'Best Params {search.best_params_}')
# Keep the per-candidate cross-validation results as a DataFrame for inspection.
results = pd.DataFrame(search.cv_results_)