In [1]:
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy import stats

In [2]:
url='https://raw.githubusercontent.com/Geerdata/DS/main/Datacoder/Arc.%20Modelo/insurance.csv'
df= pd.read_csv(url,sep=',')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
lista = ['sex', 'bmi', 'smoker', 'region']
categoricas = df [lista]
categoricas.head()
noCategoricas = df.drop(columns=lista)

In [4]:
copia = categoricas.copy()
copia.head()

Unnamed: 0,sex,bmi,smoker,region
0,female,27.9,yes,southwest
1,male,33.77,no,southeast
2,male,33.0,no,southeast
3,male,22.705,no,northwest
4,male,28.88,no,northwest


In [5]:
label_encoder = LabelEncoder()
for column in copia:
    label_encoder.fit([column])
    copia[column] = label_encoder.fit_transform(copia[column])


categoricas = copia.copy()
categoricas.head()

Unnamed: 0,sex,bmi,smoker,region
0,0,197,1,3
1,1,350,0,2
2,1,331,0,2
3,1,73,0,1
4,1,223,0,1


In [6]:
df_limpio = pd.concat ([categoricas, noCategoricas], axis = 1)
df_limpio.reset_index(drop=True, inplace=True)
df_limpio


Unnamed: 0,sex,bmi,smoker,region,age,children,charges
0,0,197,1,3,19,0,16884.92400
1,1,350,0,2,18,1,1725.55230
2,1,331,0,2,28,3,4449.46200
3,1,73,0,1,33,0,21984.47061
4,1,223,0,1,32,0,3866.85520
...,...,...,...,...,...,...,...
1333,1,276,0,1,50,3,10600.54830
1334,0,302,0,0,18,0,2205.98080
1335,0,422,0,2,18,0,1629.83350
1336,0,146,0,3,21,0,2007.94500


In [7]:
X = df_limpio[['sex', 'bmi','smoker', 'region','children', 'age']]
y = df_limpio['charges']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [11]:
# Explicación de la varianza de las componentes
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.98957283])

In [12]:
X_train #pasamos de tener 7 a 2 variables para entrenar

array([[-236.80223866],
       [-151.79771177],
       [-139.73377321],
       ...,
       [-132.66219686],
       [ 277.04803882],
       [ -12.70501596]])

In [13]:
X_test #lo mismo pasa en el conjunto de prueba

array([[-132.82415326],
       [  -8.94456476],
       [ -92.5893671 ],
       [-118.81146637],
       [  37.84012938],
       [ 252.01272376],
       [-194.14172737],
       [ 181.38921003],
       [-261.02573242],
       [ -45.77983568],
       [-151.00555045],
       [   5.11658682],
       [ -21.02949199],
       [ 246.16123292],
       [ 232.3367166 ],
       [ 195.17770628],
       [ 250.19825696],
       [ 154.18089901],
       [ -61.81461633],
       [ -92.85083221],
       [-162.06698671],
       [  57.14003817],
       [   1.8455825 ],
       [ 213.86233451],
       [ -59.76633384],
       [  59.27290586],
       [ 116.34708094],
       [   1.94244028],
       [-100.80175986],
       [  -5.16763233],
       [ -28.89067067],
       [  36.31337341],
       [ 111.86050365],
       [-237.00147562],
       [ -10.0663474 ],
       [ 214.1487472 ],
       [-105.12120139],
       [ -12.85449619],
       [-155.73386347],
       [ 118.04680879],
       [ -47.15075045],
       [  22.878

In [25]:
#Entrenamiento del modelo
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [15]:
regressor.coef_

array([18.02607249])

In [16]:
regressor.intercept_

13346.089736364484

In [18]:
#Realizando las predicciones
y_pred = regressor.predict(X_test)

In [19]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
764,9095.06825,10951.791922
887,5272.17580,13184.854364
890,29330.98315,11677.067094
1293,9301.89355,11204.385631
259,33750.29180,14028.198651
...,...,...
109,47055.53210,15425.996363
575,12222.89830,11801.893196
535,6067.12675,12211.948326
543,63770.42801,18307.921140


In [20]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 9818.380046740207
Mean Squared Error: 149580821.92930952
Root Mean Squared Error: 12230.32386853715


In [21]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.03650857245660866