<a href="https://colab.research.google.com/github/yyx462/ML/blob/main/regressionExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn as sk
import math

# Load data

In [None]:
df = pd.read_csv(r"possum_train.csv", index_col=0)

FileNotFoundError: ignored

# Data Exploration

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
# Look at the relation between head length (hdlngth) and total length (totlngth)

In [None]:
sns.scatterplot(data = df, x = "hdlngth", y = "totlngth")

In [None]:
# Box plot of the "age" column grouped by the "site" column, coloured with
# the "Set3" palette.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed seaborn version.
# The trailing semicolon on the last line keeps the notebook from echoing the
# object returned by plt.title.
sns.boxplot(x="site", y="age", data=df ,palette="Set3"); 
plt.title("Boxplot showing the distribution of the age of possums from different sites.");

In [None]:
# handling missing data

In [None]:
df.info()

In [None]:
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    """Draw one subplot per column of ``dataset`` on a single figure.

    Object-dtype columns are drawn as horizontal count plots whose tick
    labels are truncated to 18 characters; every other column is drawn as a
    histogram.

    Args:
        dataset: DataFrame whose columns are plotted.
        cols: Number of subplot columns in the grid.
        width: Figure width, passed to ``figsize``.
        height: Figure height, passed to ``figsize``.
        hspace: Vertical spacing between subplots.
        wspace: Horizontal spacing between subplots.
    """
    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later dropped the old names; pick whichever this install provides.
    style_name = 'seaborn-whitegrid'
    if style_name not in plt.style.available:
        style_name = 'seaborn-v0_8-whitegrid'
    plt.style.use(style_name)

    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    # Enough rows to hold every column at `cols` subplots per row.
    rows = math.ceil(dataset.shape[1] / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        # `np.object` was removed in NumPy 1.24; the builtin `object` is the
        # equivalent dtype comparison and works on every NumPy version.
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset)
            # Truncate long category names so they do not crowd the subplot.
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
        else:
            sns.histplot(dataset[column])
        plt.xticks(rotation=25)
print(df.columns)
# plot_distribution(df, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)



In [None]:
# Pairwise overview of df: regression plots with a LOWESS fit off the
# diagonal and histograms on the diagonal.
sns.set(style='white', font_scale=1.6)
g = sns.PairGrid(df, aspect=1.4, diag_sharey=False)
# Lower triangle: regplot with a LOWESS curve drawn in black, ci disabled.
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
# NOTE(review): histplot is called without kde=True, so kde_kws presumably
# has no visible effect here -- confirm and drop it if so.
g.map_diag(sns.histplot, kde_kws={'color': 'black'})
# Upper triangle: same plot as the lower triangle.
g.map_upper(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
plt.show()

# Prepare dataset

In [None]:
# Some data is categorical, we must convert this to numerical data
# Maybe we want to only use a few features
# Split data into features and labels (targets)

In [None]:
df.shape

In [None]:
# remove nans

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
# The Pop and sex columns are not numerical.
# We can either map each category to an integer code (0, 1, 2, ...) or use a one-hot encoding.

In [None]:
df_initial = df.copy() # keep a copy of the original so we don't have to reload the data

In [None]:
df['Pop'].unique()
df['sex'].unique()

In [None]:
df['sex']=df['sex'].map({"m":0,"f":1})

In [None]:
df['Pop']=df['Pop'].map({"other":0,"Vic":1})

In [None]:
# Drop "site" (its intended meaning is unclear to the author and it is
# sparsely populated) together with "case"; df is modified in place.
attributes_to_drop = ["site", "case"]
df.drop( attributes_to_drop,axis=1,inplace=True)

In [None]:
df

In [None]:
def process_data_frame(data_in):
    """Return an encoded copy of ``data_in`` ready for modelling.

    The ``sex`` column is mapped "m" -> 0 / "f" -> 1 and the ``Pop`` column
    "other" -> 0 / "Vic" -> 1; the ``site`` and ``case`` columns are removed.
    The input frame itself is left untouched.
    """
    sex_codes = {"m": 0, "f": 1}
    pop_codes = {"other": 0, "Vic": 1}

    # assign() works on a copy, so the caller's frame is not modified.
    encoded = data_in.assign(
        sex=data_in['sex'].map(sex_codes),
        Pop=data_in['Pop'].map(pop_codes),
    )
    return encoded.drop(columns=["site", "case"])

In [None]:
def get_features_and_target(dataframe,target="totlngth"):
    """Split ``dataframe`` into a feature frame and a target series.

    Returns a ``(features, label)`` pair: ``features`` is a copy of the frame
    without the ``target`` column, ``label`` is a copy of that column.
    """
    label = dataframe[target].copy()
    remaining = dataframe.drop(columns=target)
    return remaining.copy(), label

In [None]:
df_processed = process_data_frame(df_initial)

In [None]:
target = "totlngth"
X, y = get_features_and_target(df_processed, target)

# Set up model and run cross-validation

In [None]:
# Suppose we have several candidate model types:
# a LinearRegression,
# a Ridge regression and a KernelRidge with a polynomial kernel

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from collections import defaultdict

# 5-fold cross-validation over the training features X and targets y.
# NOTE(review): KFold is used without shuffling, so each fold is a contiguous
# range of rows -- confirm the rows of the training CSV are not ordered in a
# way that biases the folds.
kf = KFold(n_splits=5)

# Per-model lists of fold RMSEs, keyed by the names in candidate_models.
train_err = defaultdict(list)
crossval_err = defaultdict(list)

candidate_models = {
    'Model 1': LinearRegression(),
    'Model 2': Ridge(),
    'Model 3': KernelRidge(kernel='poly'),
}

for model_name, candidate_model in candidate_models.items():
    print("Model type:", candidate_model)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The same estimator instance is refitted on every fold.
        model = candidate_model.fit(X_train, y_train)

        predictions_training = model.predict(X_train)
        predictions = model.predict(X_test)

        # mean_squared_error expects (y_true, y_pred); the result is the same
        # for MSE either way, but the documented order stays correct if the
        # metric is later swapped for an asymmetric one.
        rmse_training = np.sqrt(mean_squared_error(y_train, predictions_training))
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        print(rmse_training, rmse)

        train_err[model_name].append(rmse_training)
        crossval_err[model_name].append(rmse)


In [None]:
# Print, for every evaluated model, the RMSE averaged over the folds on the
# training split and on the held-out split.
for model_name, fold_errors in crossval_err.items():
    mean_train = np.mean(train_err[model_name])
    mean_crossval = np.mean(fold_errors)
    print(f'Training error of {model_name}: {mean_train}')
    print(f'Cross-val error of {model_name}: {mean_crossval}')

In [None]:
# Which is the best model?

# Generate Kaggle submission

In [None]:
# load test data (without labels)

In [None]:
test_set = pd.read_csv(r"possum_test_no_labels.csv", index_col=0)

In [None]:
test_set

In [None]:
# transform test_set in the same format as train_data

In [None]:
processed_test_set = process_data_frame(test_set)

In [None]:
processed_test_set

In [None]:
# Train best model using the full training data
best_model = LinearRegression().fit(X, y)

In [None]:
prediction_test = best_model.predict(processed_test_set)

In [None]:
# Generate file to upload to Kaggle

In [None]:
processed_test_set['label']=prediction_test #adding the column to the test_set

In [None]:
processed_test_set['label'].to_csv('predictions_test.csv',index_label='ID') # save CSV that can be dumped