In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Linear regression

In [None]:
#Create a dataset to demonstrate linear regression

In [None]:
# Evenly spaced grid on [0, 10) with step 0.1.
# NOTE(review): the next cell reassigns `x` with step 0.2, so this value is not used afterwards.
x = np.arange(0,10,0.1)

In [None]:
# Build a noisy linear dataset: y = 0.7 * x + 1 + noise.
# A seeded generator makes the figure identical on every Restart & Run All.
rng = np.random.default_rng(42)
x = np.arange(0, 10, 0.2)
# One noise value per x, so the arrays stay aligned even if the grid above is changed.
delta = rng.uniform(-1, 2, size=x.shape)
y = 0.7 * x + 1 + delta
plt.scatter(x, y, alpha=0.5)

In [None]:
# Ordinary least-squares model for the synthetic linear data.
reg = LinearRegression()

In [None]:
# scikit-learn expects a 2-D feature matrix, so the 1-D x is reshaped into a single column;
# y is reshaped the same way here.
reg.fit(x.reshape(-1,1),y.reshape(-1,1))

In [None]:
# Predict on a finer grid (step 0.1), laid out as a single feature column.
x_test = np.arange(0, 10, 0.1)[:, np.newaxis]
y_pred = reg.predict(x_test)

In [None]:
# Training points with the fitted regression line (red) drawn on top.
plt.scatter(x,y,alpha=0.5)
plt.plot(x_test, y_pred, color = 'r')

In [None]:
#non-linear relationship

In [None]:
# Cubic ground truth with roots at x = 1, 5 and 9, plus uniform noise in [-3, 5).
# A seeded generator keeps the data reproducible; the noise size follows x instead of a
# hard-coded 50, so it cannot drift out of sync with the grid.
rng = np.random.default_rng(42)
delta = rng.uniform(-3, 5, size=x.shape)
y = (x - 1) * (x - 5) * (x - 9) + delta

In [None]:
# Visualise the non-linear relationship between x and y.
plt.scatter(x,y, alpha = 0.5)

In [None]:
# Polynomial features up to degree 3, so that a linear model can fit the cubic relationship.
poly_df = pd.DataFrame({'x': x}).assign(x2=x * x, x3=x * x * x)

In [None]:
# Preview the first rows of the polynomial feature table.
poly_df.head()

In [None]:
# Separate linear model for the polynomial features.
polyreg = LinearRegression()

In [None]:
# Fit y as a linear combination of x, x^2 and x^3.
polyreg.fit(poly_df, y)

In [None]:
# Predictions on the same rows the model was fitted on (no separate test set here).
y_pred = polyreg.predict(poly_df)

In [None]:
# Data points with the fitted cubic curve (red) drawn on top.
plt.scatter(x,y, alpha = 0.5)
plt.plot(x,y_pred, color = 'r')

In [None]:
# Root mean squared error of the polynomial fit, computed on the data it was fitted on.
np.sqrt(mean_squared_error(y, y_pred))

## Linear regression

We gaan een lineair regressiemodel trainen op een echte dataset.

In [None]:
# Load the housing data; the path is relative to the notebook's working directory.
houseprice = pd.read_csv('../data/housing_prices.csv')

In [None]:
# Preview the first rows of the housing data.
houseprice.head()

In [None]:
# First list the columns, so we can see which ones we want to use.
houseprice.columns

In [None]:
# Summary statistics of the `price` column.
houseprice['price'].describe()

In [None]:
# Pairwise plots for a handful of features together with the price.
pairplot_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'price']
sns.pairplot(houseprice[pairplot_cols])

In [None]:
# Scatter of `price` against `bedrooms`.
plt.scatter(houseprice['bedrooms'], houseprice['price'])

In [None]:
# Scatter of `price` against `bathrooms`.
plt.scatter(houseprice['bathrooms'], houseprice['price'])

In [None]:
# Scatter of `price` against `sqft_above`.
# NOTE(review): `sqft_above` is plotted here but is not among the model features selected later.
plt.scatter(houseprice['sqft_above'], houseprice['price'])

In [None]:
# Scatter of `price` against `floors`.
plt.scatter(houseprice['floors'], houseprice['price'])

We definiëren het dataframe X met alle onafhankelijke variabelen (of features) en y met de afhankelijke variabele (of de target).

In [None]:
# First list the columns, so we can see which ones we want to use.
# NOTE(review): this repeats an earlier cell that displays the same column list.
houseprice.columns

In [None]:
# Feature matrix (independent variables) and target (dependent variable).
feature_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors']
X = houseprice[feature_cols]
y = houseprice['price']

We gebruiken train_test_split van sklearn om dit te splitten in een train en test set, zie: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 42)

In [None]:
# Fresh model instance; this rebinds `reg`, replacing the model fitted on the synthetic data.
reg = LinearRegression()

In [None]:
# Fit on the training split only.
reg.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out test rows.
y_pred = reg.predict(X_test)

In [None]:
# Root mean squared error on the test set, in the same units as `price`.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Fitted coefficients, one per feature column in the order of X_train.
reg.coef_

In [None]:
#Linear regression with statsmodels

In [None]:
import statsmodels.api as sm

# Fit the same regression with statsmodels and print its summary table.
# The design matrix gets its own name so that X (the plain feature frame) is not
# overwritten with an extra `const` column that later cells would silently pick up.
X_const = sm.add_constant(X)  # OLS does not add an intercept by itself
model2 = sm.OLS(y, X_const)
results2 = model2.fit()
print(results2.summary())

In [None]:
# Reduced feature set: the same columns as before, minus 'floors'.
reduced_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']
X_ = houseprice[reduced_cols]
y = houseprice['price']

In [None]:
# Split the reduced feature set X_ (without 'floors'), so that the refit below really
# evaluates the smaller model; same test fraction and seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.33, random_state=42)

In [None]:
# Refit the existing `reg` estimator on the new training split.
reg.fit(X_train,y_train)

In [None]:
# NOTE(review): identical to the previous cell; refitting on the same data gives the same
# model, so this cell looks redundant.
reg.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out test rows with the refitted model.
y_pred = reg.predict(X_test)

In [None]:
# Root mean squared error on the test set, for comparison with the earlier model.
np.sqrt(mean_squared_error(y_test, y_pred))

# Decision Trees


## Data preparation

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier


First we load the preprocessed, numeric data and derive the binary target variable.

In [None]:
# Load the numeric version of the adult dataset and preview the first rows.
df = pd.read_csv("../data/adult_reconstruction.numeric.csv")
df.head()

In [None]:
# Derive the binary target: 1 when income exceeds 50,000, otherwise 0.
# The boolean comparison is cast to int in the same step.
df["income_above_50K"] = (df["income"] > 50_000).astype(int)

# Remove the raw income column so it does not end up among the features.
df = df.drop(columns=["income"])
df.head()

In [None]:
target_col = "income_above_50K"
X = df.drop(columns=[target_col])  # DataFrame with every column except the target
y = df[target_col]  # Series holding only the target

# Hold out 33% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

## A simple Decision Tree

For this we use the standard sklearn implementation, thus the API is very straightforward (create an instance of the classifier, fit and predict).

In case of a single tree, we can also plot the structure.

In [None]:
# Shallow tree (max_depth=3) so that the plotted structure stays readable.
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(X_train, y_train)

plt.figure(figsize=(10, 10))
# Label the nodes with the columns the model was actually trained on, as a plain list.
# This does not rely on the target being the last column of df, and a list is accepted
# by every scikit-learn release (some releases reject a pandas Index for feature_names).
_ = plot_tree(model, feature_names=list(X_train.columns), filled=True)

In [None]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of true versus predicted labels (rows are true classes).
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix as a heat map.
ConfusionMatrixDisplay(cm).plot()

In [None]:
# Fraction of test rows classified correctly by the decision tree.
accuracy_score(y_test,y_pred)

## Random Forest

Using a random forest classifier is just as simple with sklearn; the only thing that has to be changed is the initialization of the model.

In [None]:
# Forest of 100 trees with no depth limit set; the fixed random_state makes training reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predicted class labels from the random forest for the held-out test rows.
y_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix of true versus predicted labels for the random forest.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix as a heat map.
ConfusionMatrixDisplay(cm).plot()

In [None]:
# Fraction of test rows classified correctly by the random forest.
accuracy_score(y_test, y_pred)

Here we have multiple trees grouped into a forest object, but the individual trees can still be accessed.

In [None]:
# Train a second forest capped at depth 3 purely for visualisation, so that plotting
# its first tree gives a readable figure.
rfc_md3 = RandomForestClassifier(max_depth=3, n_estimators=100, random_state=42)
rfc_md3.fit(X_train, y_train)

# The individual fitted trees are available through `estimators_`; draw the first one.
single_tree = rfc_md3.estimators_[0]
plt.figure(figsize=(20, 20))
_ = plot_tree(single_tree)

In [None]:
# Importance score per feature from the full-depth forest `rfc` (not the depth-3 one).
rfc.feature_importances_

In [None]:
# Bar chart of the importances, labelled with the training column names.
plt.bar(X_train.columns, rfc.feature_importances_)

In [None]:
# Same bar chart on a larger canvas, with the x tick labels rotated by 45 degrees.
# The statements act on pyplot's current figure, so their order matters.
plt.figure(figsize=(15, 15))
plt.xticks(rotation=45)
plt.bar(X_train.columns, rfc.feature_importances_)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
lg = LogisticRegression()


In [None]:
# Fit on the raw (unscaled) features.
# NOTE(review): presumably kept to motivate the scaling step that follows; with unscaled
# inputs the solver may emit a convergence warning. Confirm this is intentional.
lg.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that standardises each feature to zero mean and unit variance.
scaler = StandardScaler()

In [None]:
# Learn the per-column mean and standard deviation from the training split.
scaler.fit(X_train)

In [None]:
# Refit the classifier on the standardised training features (replaces the fit on raw features).
lg.fit(scaler.transform(X_train),y_train)

In [None]:
# Do NOT refit the scaler on the test set. The test features must be standardised with the
# means and variances learned from X_train: those are the statistics the model was trained
# with, and refitting on X_test would also leak test-set information into preprocessing.
# Here the already-fitted scaler is only applied; the result is kept for inspection.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Standardise the test features with the fitted scaler and predict class labels.
y_pred = lg.predict(scaler.transform(X_test))

In [None]:
# Confusion matrix of true versus predicted labels for the logistic regression.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix as a heat map.
ConfusionMatrixDisplay(cm).plot()

In [None]:
# Fraction of test rows classified correctly by the logistic regression.
accuracy_score(y_test, y_pred)