# Regression Homework

# Required Libraries

In [3]:
# for data manipulation and analysis
import pandas as pd
import numpy as np

#for data visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize
from matplotlib.lines import lineStyles
from networkx.algorithms.bipartite.basic import color
from scipy import stats
import scipy.stats as statsha
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_predict, cross_val_score

In [4]:
import warnings
# Suppress all warnings for the rest of the session to keep cell output readable.
# NOTE(review): this also hides warnings that may matter while debugging —
# consider narrowing the filter if results look suspicious.
warnings.filterwarnings("ignore")

## Helper Functions

Before starting the Air Quality project, I created helper functions for:
-   avoiding repetition of the same code
-   ensuring all models are evaluated in the same way
-   saving time and reducing errors

### Residual Analysis Plot

We can call this function for every model to evaluate model performance consistently.

In [5]:
def plot_residuals(y_true, y_pred, model_name = ""):
    """Plot residual diagnostics for a regression model and print summary statistics.

    Draws three panels side by side: residuals vs. predicted values, a
    histogram of the residuals, and a normal Q-Q plot. After showing the
    figure, prints the mean, standard deviation and skewness of the residuals.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    model_name : str, optional
        Label used in the panel titles and in the printed header.

    Returns
    -------
    None
    """
    # Flatten both inputs to plain 1-D arrays so the subtraction is positional:
    # pandas objects would otherwise be aligned on their index labels, and a
    # column vector minus a 1-D array would broadcast to an (n, n) matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    residuals = y_true - y_pred

    # `figsize` must be passed as a keyword; plt.subplots accepts only
    # nrows/ncols positionally.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    ax_scatter, ax_hist, ax_qq = axes

    # Residuals vs predicted values: a random cloud around the zero line is
    # the pattern a well-specified model should produce.
    ax_scatter.scatter(y_pred, residuals, alpha = 0.6)
    ax_scatter.axhline(y = 0, color = 'red', linestyle = '--')
    ax_scatter.set_xlabel('Predicted Values')
    ax_scatter.set_ylabel('Residuals')
    ax_scatter.set_title(f"{model_name} - Residuals vs Predicted Values")

    # Residual distribution
    ax_hist.hist(residuals, bins = 30, alpha = 0.7, edgecolor = 'black')
    ax_hist.axvline(x = 0, color = 'red', linestyle = '--')
    ax_hist.set_xlabel('Residuals')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title(f"{model_name} - Residuals Distribution")

    # Q-Q plot against the normal distribution for a normality check
    stats.probplot(residuals, dist="norm", plot=ax_qq)
    ax_qq.set_title(f'{model_name} - Q-Q Plot')

    plt.tight_layout()
    plt.show()

    # Print residual summary statistics
    print(f"Residentual Statistics for {model_name} :\n")
    print(f"Mean of Residuals: {np.mean(residuals):.4f}")
    print(f"Standard Deviation of Residuals: {np.std(residuals):.4f}")
    print(f"Skewness of Residuals: {stats.skew(residuals):.4f}")

### Model Evaluation Metrics

After training each model, we use this function to generate a comprehensive evaluation report that also explains what each metric means.

In [6]:
def evaluation_metrics(model_name, y_true, y_pred, X_train = None, model = None):
    """Print a regression evaluation report and return the metrics as a dict.

    Always computes MSE, RMSE, MAE and R². When both ``X_train`` and ``model``
    are supplied, additionally reports the adjusted R² and a 5-fold
    cross-validated R² score.

    Parameters
    ----------
    model_name : str
        Label used in the printed report and stored under the ``'model'`` key.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    X_train : array-like with a ``shape`` attribute, optional
        Feature matrix; its second dimension is used as the number of
        predictors for the adjusted R². It is also passed, together with
        ``y_true``, to cross-validation, so its rows must correspond to
        ``y_true``.
    model : estimator, optional
        Estimator handed to ``cross_val_score``.

    Returns
    -------
    dict
        Keys ``'model'``, ``'r2'``, ``'adj_r2'``, ``'rmse'``, ``'mae'`` and
        ``'mse'``. ``'adj_r2'`` is ``None`` when ``X_train``/``model`` are not
        both given, or when there are too few samples for it to be defined.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    has_training_context = X_train is not None and model is not None

    # Adjusted R²: penalises R² for the number of predictors.
    adj_r2 = None
    if has_training_context:
        n = len(y_true)
        p = X_train.shape[1]
        residual_dof = n - p - 1
        # With n <= p + 1 the denominator is zero or negative and the
        # adjusted R² is undefined, so leave it as None instead of crashing
        # or reporting a meaningless number.
        if residual_dof > 0:
            adj_r2 = 1 - (1 - r2) * (n - 1) / residual_dof

    # Print the evaluation metrics
    print(f"Evaluation Metrics for {model_name}: \n")
    print(" - " * 50)
    print(f"\nR2 Score: {r2:.4f}")

    if adj_r2 is not None:
        print(f"Adjusted r2 Score: {adj_r2:.4f}")

    print(f"RMSE: {rmse:.4f} -  Standard Deviation of residuals")
    print(f"MAE: {mae:.4f} - Average Absolute Error")
    print(f"MSE: {mse:.4f} - Mean Squared Error")

    if has_training_context:
        cv_scores = cross_val_score(model, X_train, y_true, cv= 5, scoring = 'r2')
        # cross_val_score returns one score per fold; an array cannot be
        # formatted with ':.4f', so summarise it as mean and spread.
        print(f"Cross Validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    print(" = " * 50)
    return {
        'model' : model_name,
        'r2': r2,
        'adj_r2': adj_r2,
        'rmse': rmse,
        'mae': mae,
        'mse': mse
    }

### Let's read the data first

In [7]:
# Location of the Air Quality workbook.
# NOTE(review): this is an absolute path on the author's machine — point it at
# your local copy of the file before running the notebook elsewhere.
data_path = '/Users/sunnakh/Desktop/epam-training/regression/homework/data/AirQualityUCI.xlsx'
# Read the Excel workbook into a DataFrame.
df = pd.read_excel(data_path)
# Preview the first 100 rows.
df.head(100)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.00,150,11.881723,1045.50,166.0,1056.25,113.0,1692.00,1267.50,13.600,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.300,47.700000,0.725487
2,2004-03-10,20:00:00,2.2,1402.00,88,8.997817,939.25,131.0,1140.00,114.0,1554.50,1074.00,11.900,53.975000,0.750239
3,2004-03-10,21:00:00,2.2,1375.50,80,9.228796,948.25,172.0,1092.00,122.0,1583.75,1203.25,11.000,60.000000,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.50,131.0,1205.00,116.0,1490.00,1110.00,11.150,59.575001,0.788794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2004-03-14,17:00:00,2.9,1437.75,156,12.033005,1050.75,180.0,942.50,128.0,1667.50,1206.25,21.300,30.750000,0.769615
96,2004-03-14,18:00:00,2.5,1477.75,122,12.163323,1055.25,160.0,929.25,121.0,1670.75,1262.25,19.650,36.700000,0.830706
97,2004-03-14,19:00:00,4.6,1807.50,262,20.571715,1312.25,261.0,753.25,157.0,1992.75,1697.75,18.375,41.725000,0.873196
98,2004-03-14,20:00:00,5.9,1898.00,341,23.142126,1381.25,325.0,680.50,173.0,2102.75,1904.75,17.625,46.099999,0.920954


In [8]:
# Summarize the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           9357 non-null   datetime64[ns]
 1   Time           9357 non-null   object        
 2   CO(GT)         9357 non-null   float64       
 3   PT08.S1(CO)    9357 non-null   float64       
 4   NMHC(GT)       9357 non-null   int64         
 5   C6H6(GT)       9357 non-null   float64       
 6   PT08.S2(NMHC)  9357 non-null   float64       
 7   NOx(GT)        9357 non-null   float64       
 8   PT08.S3(NOx)   9357 non-null   float64       
 9   NO2(GT)        9357 non-null   float64       
 10  PT08.S4(NO2)   9357 non-null   float64       
 11  PT08.S5(O3)    9357 non-null   float64       
 12  T              9357 non-null   float64       
 13  RH             9357 non-null   float64       
 14  AH             9357 non-null   float64       
dtypes: datetime64[ns](1),

### Univariate Analysis

After loading the dataset and checking how many columns we have and what each feature roughly represents, the next step is to examine each feature on its own.

#### Why do we do this?

Mainly to build a basic understanding of the data. For example, we want to:

- identify which features are numerical and which are categorical;
- see whether any categorical features are ordinal (have a natural order);
- check if there are binary categorical features;
- look at the distribution of each feature;
- detect any obvious outliers;
- understand how each feature might relate to the target variable we want to predict;
- and explore anything else that might be useful depending on the project needs.

This list isn’t exhaustive; the exact steps can vary depending on the business problem and the dataset.
