In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#SK-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
#Scalers
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
from sklearn.preprocessing import StandardScaler,MinMaxScaler
#Modelos Lineales
# https://scikit-learn.org/stable/modules/linear_model.html
from sklearn.linear_model import LinearRegression , Ridge
from sklearn.svm import SVR
#Metricas
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
data = pd.read_csv(r'C:\Users\Notebook Asus\Documents\Ciencia de datos\Xy_train.csv', delimiter=",")

In [3]:
data

Unnamed: 0,X,y
0,2.273360,6.054685
1,3.167583,4.581428
2,7.973655,5.392507
3,6.762547,3.108068
4,3.911096,4.225744
...,...,...
95,3.787495,5.133706
96,2.759471,4.308327
97,9.661041,11.923565
98,0.582026,1.335725


In [4]:
x = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [5]:
print("Los primeros 5 valores del array x: \n", x[:5])
print("\n")
print("Los primeros 5 valores del array y: \n", y[:5])

Los primeros 5 valores del array x: 
 [[2.27336022]
 [3.1675834 ]
 [7.97365457]
 [6.76254671]
 [3.91109551]]


Los primeros 5 valores del array y: 
 [6.05468511 4.58142822 5.39250705 3.10806751 4.22574359]


In [6]:
np.shape(x)

(100, 1)

In [7]:
np.shape(y)

(100,)

In [8]:
dataT = pd.read_csv(r'C:\Users\Notebook Asus\Documents\Ciencia de datos\X_test.csv', delimiter=",")

In [9]:
dataT.head(5)

Unnamed: 0,X
0,6.1708
1,6.302022
2,8.689293
3,2.376897
4,2.70732


In [10]:
x_pred = dataT.iloc[:,:,].values

In [11]:
print("Los primeros 5 valores del array x train: \n", x_pred[:5])

Los primeros 5 valores del array x train: 
 [[6.17080018]
 [6.30202215]
 [8.68929334]
 [2.37689724]
 [2.70731977]]


In [12]:
np.shape(x_pred)

(20, 1)

In [40]:
# Separamos set de entrenamiento y testeo
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.4, random_state=1)

In [41]:
# Definimos el escalador 
scaler = StandardScaler()
# Fiteamos para los x_train
scaler = scaler.fit(x_train)

In [42]:
# Transformamos los x_train
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [43]:
# Vamos a crear un dataframe para guardar los resultados de cada estimador
results_df = pd.DataFrame(columns=['Model','Features','R2','MSE','MAE'])
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE


# Regresion Lineal

In [44]:
#Definimos el estimador
est = LinearRegression()
#Definimos diccionario con hyperparametros
parameters = {'fit_intercept' : [False, True]}
#Definimos la cantidad de folds para validar
n_folds = 5
#Definimos el objeto grid search con los parametros anteriores
gs = GridSearchCV(est, param_grid=parameters, cv=n_folds, refit=True, scoring="neg_mean_squared_error")

In [45]:
#Entrenamos
gs.fit(x_train_scaled, y_train)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'fit_intercept': [False, True]},
             scoring='neg_mean_squared_error')

In [46]:
# Mostramos el mejor estimador, sus hyperparametros seleccionados y su resultado
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

LinearRegression() 

{'fit_intercept': True} 

-3.5068629970764618 



In [47]:
# Usamos x_test para predecir con el mejor estimador
linear_prediction = gs.best_estimator_.predict(x_test_scaled)
#Calculamos las metricas R2, MSE y MAE
linear_r2 = r2_score(y_true=y_test, y_pred=linear_prediction)
linear_mse = mean_squared_error(y_true=y_test, y_pred=linear_prediction)
linear_mae = mean_absolute_error(y_true=y_test, y_pred=linear_prediction)

In [48]:
print(f'R2 score: {linear_r2:.6f}')
print(f'MAE: {linear_mae:.6f}')
print(f'MSE: {linear_mse:.6f}')

R2 score: 0.407764
MAE: 1.518342
MSE: 4.025814


In [49]:
#Guardamos los resultados
results_df = results_df.append({'Model':'Linear',
                                'Features':'Lineal',
                                'R2':linear_r2,
                                'MSE':linear_mse,
                                'MAE':linear_mae},ignore_index=True)
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.407764,4.025814,1.518342


# Regresion Ridge

In [50]:
est = Ridge()
# Lista del parametro lambda (llamado 'alpha' en el GridSearch )
lambdas = [0.001,0.005,0.01,0.02,0.05,0.1,0.2,0.3,0.4,0.5,1]
# Juntamos el diccionario de parametros
parameters = {'alpha': lambdas}
# Definimos nuevamente el objeto GS con los parametros previamente definidos
gs = GridSearchCV(est, param_grid=parameters,refit=True, cv=n_folds, scoring="neg_mean_squared_error")

In [51]:
# Entrenamos
gs.fit(x_train_scaled, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2,
                                   0.3, 0.4, 0.5, 1]},
             scoring='neg_mean_squared_error')

In [52]:
# Mostramos el mejor estimador, sus hyperparametros seleccionados y su resultado
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

Ridge(alpha=1) 

{'alpha': 1} 

-3.5042379309532685 



In [53]:
#Predicciones + Metricas
ridge_prediction = gs.best_estimator_.predict(x_test_scaled)
ridge_r2 = r2_score(y_true=y_test, y_pred=ridge_prediction)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_prediction)
ridge_mae = mean_absolute_error(y_true=y_test, y_pred=ridge_prediction)

In [54]:
print(f'R2 score: {ridge_r2:.6f}')
print(f'MAE: {ridge_mse:.6f}')
print(f'MSE: {ridge_mae:.6f}')

R2 score: 0.405198
MAE: 4.043256
MSE: 1.518566


In [55]:
#Guardamos resultados
results_df = results_df.append({'Model':'Ridge',
                                'Features':'Lineal',
                                'R2':ridge_r2,
                                'MSE':ridge_mse,
                                'MAE':ridge_mae},ignore_index=True)

In [56]:
results_df.drop(results_df.index[2:5],0,inplace=True)

In [57]:
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.407764,4.025814,1.518342
1,Ridge,Lineal,0.405198,4.043256,1.518566


# Regresion Support Vector

In [58]:
est = SVR(max_iter=25000)
# Lista del parametro lambda (parametro 'alpha')
parameters = {'C' : [1000,1500,2000,3000,5000],
              'epsilon' : [0.001, 0.01, 0.1,1, 10,100],
             'gamma':[0.001, 0.01, 0.1,1, 10,100]}
# Definimos nuevamente el objeto GS con los parametros previamente definidos
gs = GridSearchCV(est, param_grid=parameters,refit=True,
                  cv=n_folds, scoring="neg_mean_squared_error",
                  verbose=3, n_jobs=3)

In [59]:
# Entrenamos
gs.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  28 tasks      | elapsed:    2.2s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    4.3s finished


GridSearchCV(cv=5, estimator=SVR(max_iter=25000), n_jobs=3,
             param_grid={'C': [1000, 1500, 2000, 3000, 5000],
                         'epsilon': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='neg_mean_squared_error', verbose=3)

In [60]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

SVR(C=1000, epsilon=0.01, gamma=1, max_iter=25000) 

{'C': 1000, 'epsilon': 0.01, 'gamma': 1} 

-0.6261346312572595 



In [61]:
# Prediccion + Metricas
svr_prediction = gs.best_estimator_.predict(x_test_scaled)
svr_r2 = r2_score(y_true=y_test, y_pred=svr_prediction)
svr_mse = mean_squared_error(y_true=y_test, y_pred=svr_prediction)
svr_mae = mean_absolute_error(y_true=y_test, y_pred=svr_prediction)

In [62]:
print(f'R2 score: {svr_r2:.6f}')
print(f'MAE: {svr_mae:.6f}')
print(f'MSE: {svr_mse:.6f}')

R2 score: 0.894489
MAE: 0.626631
MSE: 0.717226


In [63]:
results_df = results_df.append({'Model':'SVR',
                                'Features':'Linear',
                                'R2':svr_r2,
                                'MSE':svr_mse,
                                'MAE':svr_mae},ignore_index=True)

In [64]:
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.407764,4.025814,1.518342
1,Ridge,Lineal,0.405198,4.043256,1.518566
2,SVR,Linear,0.894489,0.717226,0.626631


In [None]:
#### Elijo SVR para predecir los resultados de x_pred ####

In [66]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")

SVR(C=1000, epsilon=0.01, gamma=1, max_iter=25000) 

{'C': 1000, 'epsilon': 0.01, 'gamma': 1} 



In [67]:
svr_prediction = gs.best_estimator_.predict(x_pred)

In [68]:
np.around(svr_prediction, decimals = 2)

array([10.26, 10.26, 10.26, 28.29, 31.47, 10.26, 10.26,  3.88, 10.26,
       31.54, 22.45, 10.26, 10.26,  3.23, 10.32, 10.26, 10.27, 10.26,
       10.26, 10.26])