In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing # One-hot-Encoder y LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error 
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
#from urllib.request import urlretrieve  # pa descargar
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Global plot styling: seaborn dark grid, small font, compact figures.
sns.set_style("darkgrid")
plt.rcParams.update({
    "font.size": 8,
    "figure.figsize": (4, 3),
    "figure.facecolor": "#00000000",  # 8-digit hex: the trailing 00 is the alpha channel
})

In [None]:
# Load the medical-charges dataset from the local ./dataset folder and preview it.
# NOTE(review): an earlier urlretrieve-based download step was commented out (its URL
# looked like a placeholder); the CSV is assumed to already exist at this path.
data = pd.read_csv("./dataset/medical-charges.csv")
data.head()

In [None]:
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Number of rows per value of the smoker column
data.smoker.value_counts()

In [None]:
# Summary statistics of the numeric columns
data.describe()

In [None]:
# Quick histogram of every numeric column
data.hist()

In [None]:
# Age: summary statistics (count, mean, spread, min/max, quartiles)
data.age.describe()

In [None]:
# Age distribution with a box-plot marginal above the histogram.
# nbins=47 -- presumably one bin per distinct age in the data; TODO confirm against
# the min/max shown by data.age.describe().
fig = px.histogram(data, x="age", marginal="box", nbins=47, title="Distribution of Age")
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# BMI distribution (red bars) with a box-plot marginal.
fig = px.histogram(
    data,
    x="bmi",
    marginal="box",
    color_discrete_sequence=["red"],
    title="Distribution of BMI",
).update_layout(bargap=0.1)  # update_layout returns the same figure, so the call can be chained
fig.show()

In [None]:
# Charges: distribution of the charges column, split and colored by smoker status,
# with a box-plot marginal. The title names the plotted quantity (charges).
fig = px.histogram(
    data,
    x="charges",
    marginal="box",
    color="smoker",
    color_discrete_sequence=["yellow", "green"],
    title="Annual Medical Charges",
)
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# Smoker class counts (same call as earlier in the notebook, repeated next to the smoker plots)
data.smoker.value_counts()

In [None]:
# Smoker counts, with bars colored by the sex column
px.histogram(data, x="smoker", color="sex", title="Smokers")

In [None]:
# Age vs charges, colored by smoker status; sex is shown on hover.
fig = px.scatter(
    data,
    x="age",
    y="charges",
    color="smoker",
    opacity=0.8,
    hover_data=["sex"],
    title="Age vs Charges",
).update_traces(marker_size=5)  # update_traces returns the same figure
fig.show()

In [None]:
# BMI vs charges, colored by smoker status; sex is shown on hover.
fig = px.scatter(
    data,
    x="bmi",
    y="charges",
    color="smoker",
    opacity=0.8,
    hover_data=["sex"],
    title="BMI vs Charges",
).update_traces(marker_size=5)  # update_traces returns the same figure
fig.show()

In [None]:
# Correlation between charges and age (Series.corr defaults to Pearson)
data.charges.corr(data.age)

In [None]:
# Correlation between charges and BMI
data.charges.corr(data.bmi)

In [None]:
# Correlation needs numbers, so map the smoker labels to 0/1 first.
smoker_values = {"no": 0, "yes": 1}
smoker_numeric = data["smoker"].map(smoker_values)
data["charges"].corr(smoker_numeric)

In [None]:
# Pairwise correlation matrix over the numeric columns only.
variable_num = ["age", "bmi", "children", "charges"]
data_num = data.loc[:, variable_num]
data_num.corr()

In [None]:
# Annotated heatmap of the numeric correlation matrix.
ax = sns.heatmap(data_num.corr(), annot=True, cmap="Reds")
ax.set_title("Correlation Matrix");

## Linear Regression Using a Single Feature

In [None]:
# Restrict to non-smokers and look at charges against age.
non_smoker_df = data[data["smoker"] == "no"]

ax = sns.scatterplot(data=non_smoker_df, x="age", y="charges", alpha=0.7, s=15)
ax.set_title("Age vs Charges");

In [None]:
# Create an unfitted linear regression model.
model = LinearRegression()
# Print the docstring of fit (exploratory; can be dropped from a finished notebook)
help(model.fit)

In [None]:
# Double brackets keep both selections as two-dimensional DataFrames.
inputs, targets = non_smoker_df[["age"]], non_smoker_df[["charges"]]
print("input shape: ", inputs.shape)
print("target shape: ", targets.shape)

In [None]:
# Fit the line charges = b + w * age on the non-smoker rows
model.fit(inputs, targets)

In [None]:
# Predict charges for three sample ages. The model was fitted on a DataFrame whose only
# column is "age", so the query is passed as a DataFrame with that same column name;
# a bare ndarray makes scikit-learn warn that X does not have valid feature names.
model.predict(pd.DataFrame({"age": [23, 37, 61]}))

In [None]:
# Predictions for the same rows the model was fitted on
predictions = model.predict(inputs)
predictions

In [None]:
# Root mean squared error between the actual charges and the predictions
root_mean_squared_error(targets, predictions)

In [None]:
# Learned weight w (slope applied to age)
model.coef_

In [None]:
# Learned bias b (intercept)
model.intercept_

In [None]:
def try_parameters(w, b):
    """Plot the line ``charges = w * age + b`` over the non-smoker data and print its RMSE.

    The line is computed from the arguments, so different (w, b) pairs give
    different plots and losses.

    Parameters
    ----------
    w : float or one-element array-like
        Slope applied to age. ``model.coef_`` can be passed directly.
    b : float or one-element array-like
        Intercept. ``model.intercept_`` can be passed directly.

    Reads the notebook-level ``non_smoker_df`` for the ages and the actual charges.
    """
    # coef_ / intercept_ are arrays when the model was fitted on a 2-D target;
    # reduce them to plain scalars before doing arithmetic with the age column.
    slope = float(np.ravel(w)[0])
    offset = float(np.ravel(b)[0])

    ages = non_smoker_df.age
    actual = non_smoker_df.charges
    estimated = slope * ages + offset

    plt.plot(ages, estimated, 'r', alpha=0.9);
    plt.scatter(ages, actual, s=8, alpha=0.8);
    plt.xlabel('Age');
    plt.ylabel('Charges')
    plt.legend(['Prediction', 'Actual']);

    print("RMSE Loss: ", root_mean_squared_error(actual, estimated))

try_parameters(model.coef_, model.intercept_)

## Linear Regression Using Multiple Features

In [None]:
# Fit charges on age and bmi for non-smokers and report the training RMSE.
inputs = non_smoker_df[["age", "bmi"]]
targets = non_smoker_df[["charges"]]

model = LinearRegression()
model.fit(inputs, targets)

predictions = model.predict(inputs)

loss = root_mean_squared_error(targets, predictions)
print("Loss: ", loss)

In [None]:
# BMI vs charges for non-smokers only.
fig = px.scatter(
    non_smoker_df,
    x='bmi',
    y='charges',
    title='BMI vs. Charges',
).update_traces(marker_size=5)  # update_traces returns the same figure
fig.show()

In [None]:
# 3-D view of the two input features (age, bmi) against charges for non-smokers.
fig = px.scatter_3d(
    non_smoker_df,
    x='age',
    y='bmi',
    z='charges',
).update_traces(marker_size=3, marker_opacity=0.5)  # update_traces returns the same figure
fig.show()

In [None]:
# Weights and intercept of the most recently fitted model
model.coef_, model.intercept_

## Binary Categories

In [None]:
# Bar plot of charges by smoker status; hue reuses the same column to color the bars
sns.barplot(data=data, x="smoker", y="charges", hue="smoker");

In [None]:
# Add a numeric 0/1 version of smoker and fit on all rows (smokers included).
variable_string = {"no": 0, "yes": 1}
data["smoker_num"] = data["smoker"].map(variable_string)

inputs = data[["age", "bmi", "children", "smoker_num"]]
targets = data[["charges"]]

model = LinearRegression()
model.fit(inputs, targets)

predictions = model.predict(inputs)

loss = root_mean_squared_error(targets, predictions)
print("Loss: ", loss)

## Multiple Categories (One-Hot Encoding) and Label Encoding

In [None]:
# Bar plot of charges by region, one color per region
sns.barplot(data, x="region", y="charges", hue="region");

In [None]:
# One-hot encode the multi-category "region" column.
enc = preprocessing.OneHotEncoder()

# Learn the categories and encode in one step, then convert the result to a dense array.
one_hot_region = enc.fit_transform(data[["region"]]).toarray()
# enc.categories_[0] holds the learned labels; they become the new column names.
data[enc.categories_[0]] = one_hot_region
data.head()


In [None]:
# Label-encode a two-valued column such as sex or smoker.
# NOTE(review): this overwrites data["sex"] in place, so earlier cells that read the
# original labels (e.g. the plots colored by sex) behave differently if re-run afterwards.
enc = preprocessing.LabelEncoder()
label_encoder = enc.fit_transform(data["sex"])
data["sex"] = label_encoder
data.head()

# To restore the original labels [female and male]:
#data["sex"] = enc.inverse_transform(data["sex"]) 

In [None]:
# Inputs: numeric columns, encoded smoker/sex, and the one-hot region columns.
input_cols = [
    'age', 'bmi', 'children', 'smoker_num', 'sex',
    'northeast', 'northwest', 'southeast', 'southwest',
]
inputs = data[input_cols]
targets = data[["charges"]]

# Fit on every row and report the training RMSE.
model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)
loss = root_mean_squared_error(targets, predictions)
print("Loss: ", loss)

In [None]:
# Raw weights and intercept of the most recently fitted model, inspected before the
# numeric features are standardized in the cells that follow.
model.coef_, model.intercept_

In [None]:
# Table of learned weights, largest first. The extra row labelled "1" holds the intercept.
weights_df = pd.DataFrame(
    {
        "feature": np.append(input_cols, 1),
        "weight": np.append(model.coef_, model.intercept_),
    }
)

weights_df.sort_values(by="weight", ascending=False)

In [None]:
# StandardScaler: learn the per-column mean and variance of the numeric features
num_cols = ["age", "bmi", "children"]
scaler = StandardScaler()
scaler.fit(data[num_cols])


In [None]:
# Per-column statistics learned by the scaler (var_ is the variance, not the standard deviation)
print(f"{num_cols} mean : {scaler.mean_}")
print(f"{num_cols} var : {scaler.var_}")

In [None]:
# Apply the fitted scaler to the numeric columns; the result is displayed as the cell output
scaled_inputs = scaler.transform(data[num_cols])
scaled_inputs

In [None]:
# Combine the scaled numeric columns with the already-encoded categorical columns.
cat_cols = ['smoker_num', 'sex', 'northeast', 'northwest', 'southeast', 'southwest']
categorical_data = data[cat_cols].values

# Column order is num_cols followed by cat_cols.
inputs = np.concatenate([scaled_inputs, categorical_data], axis=1)
targets = data[["charges"]]

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)
loss = root_mean_squared_error(targets, predictions)
print("Loss: ", loss)

In [None]:
# Weights of the model fitted on scaled inputs, largest first.
# Feature order matches the concatenation order (num_cols, then cat_cols); the extra
# row labelled "1" holds the intercept.
weights_df = pd.DataFrame(
    {
        "feature": np.append([*num_cols, *cat_cols], 1),
        "weight": np.append(model.coef_, model.intercept_),
    }
)

weights_df.sort_values(by="weight", ascending=False)

In [None]:
# Test set: hold out 10% of the rows and evaluate on them.
# random_state pins the shuffle so the reported losses are the same on every
# Restart & Run All; without it each run draws a different split.
# NOTE(review): the scaler was fitted on the full dataset before this split, so the
# held-out rows influenced the scaling statistics -- confirm whether that is acceptable.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)

model = LinearRegression().fit(inputs_train, targets_train)
predictions = model.predict(inputs_test)
loss = root_mean_squared_error(targets_test, predictions)
print("Test Loss: ", loss)

In [None]:
# Training-set RMSE of the current model (predictions on inputs_train vs targets_train),
# printed for comparison with the test loss
predictions_train = model.predict(inputs_train)
loss = root_mean_squared_error(targets_train, predictions_train)
print('Training Loss:', loss)