# Lab - machine uprising

In [1]:
import pandas as pd
import numpy as np
import statistics as stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, StandardScaler, minmax_scale, OneHotEncoder
%matplotlib inline

from scipy.stats import iqr
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
df = pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')

* **removing useless columns**

In [3]:
df.drop(['effective_to_date', 'month'], axis=1, inplace=True)

* **remove outliers function**

In [4]:
def remove_outliers(df):
    for c in df.columns:
            pct_75 = np.percentile(df[c], 75)
            pct_25 = np.percentile(df[c], 25)
            upper_bound = pct_75 + 1.5*iqr(df[c])
            lower_bound = pct_25 - 1.5*iqr(df[c])
            condition = (df[c] < upper_bound) & (df[c] > lower_bound)
            df[c] = df[c][condition]
    return df

* **filling nas**

In [5]:
df.isnull().values.any()

False

* **X,y split**

In [6]:
X = df.drop('total_claim_amount', axis=1)
y = df[['total_claim_amount']]

* **train test split**

In [7]:
tt_ratio = 0.3
rand_seed = 34

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=tt_ratio, 
                                                    random_state=rand_seed)

* **standardizing the data**

In [8]:
#split into numeric and categorical
numeric = df.select_dtypes(include=[np.number])
categorical = df.select_dtypes(include=[object])

#get numeric and cat columns' names
numeric_columns_X = list(numeric.drop('total_claim_amount', axis=1).columns)
categorical_columns_X = list(categorical.columns)


In [9]:
#creating a transformer
pt = PowerTransformer()
ct = ColumnTransformer([('pt', pt, numeric_columns_X),
                        ('dm', OneHotEncoder(drop='first'), categorical_columns_X)],
                       remainder='drop', verbose_feature_names_out=True, verbose=True).fit(X_train)

[ColumnTransformer] ............ (1 of 2) Processing pt, total=   0.0s
[ColumnTransformer] ............ (2 of 2) Processing dm, total=   0.0s


In [10]:
#applying transformer
X_train_ct = pd.DataFrame(ct.transform(X_train), columns=ct.get_feature_names_out())
X_test_ct = pd.DataFrame(ct.transform(X_test), columns=ct.get_feature_names_out())

* **creating a model**

In [11]:
#adding a constant
X_train_ct_const = sm.add_constant(X_train_ct.to_numpy())
X_test_ct_const = sm.add_constant(X_test_ct)

#model
model = sm.OLS(y_train, X_train_ct_const).fit()

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.744
Model:                            OLS   Adj. R-squared:                  0.742
Method:                 Least Squares   F-statistic:                     459.6
Date:                Wed, 01 Feb 2023   Prob (F-statistic):               0.00
Time:                        17:03:42   Log-Likelihood:                -47976.
No. Observations:                7482   AIC:                         9.605e+04
Df Residuals:                    7434   BIC:                         9.638e+04
Df Model:                          47                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         73.4122     15.425      4.759      0.0

* **evaluation**

In [12]:
#model prediction values
y_pred_test = pd.DataFrame(model.predict(X_test_ct_const), columns=['claim_amount']) 
y_pred_train = pd.DataFrame(model.predict(X_train_ct_const), columns=['claim_amount'])

In [13]:
#MSE, RMSE, MAE
print(mse(y_test,y_pred_test))
print(mae(y_test,y_pred_test))
print(mse(y_train,y_pred_train))

22917.184070194886
102.09328435477033
21728.63184990571


In [14]:
#R2
R2=r2_score(y_test, y_pred_test)
R2

0.7372070555211103

In [15]:
#adjusted R2
model=LinearRegression()    # model
model.fit(X_train_ct, y_train)   # model train


R2_test = model.score(X_test_ct, y_test)
R2_train = model.score(X_train_ct, y_train)

Adj_R2 = 1 - (1-R2) * (len(y_test)-1) / (len(y_test) - X_test.shape[1]-1)
Adj_R2

0.7354743547882825

In [17]:
#features importances
features_importances = pd.DataFrame(data={'Attribute': X_train_ct.columns,
                                          'Importance': abs(model.coef_.reshape(len(X_train_ct.columns),))})

features_importances = features_importances.sort_values(by='Importance', ascending=False)
features_importances

Unnamed: 0,Attribute,Importance
34,dm__policy_special l2,40538310000000.0
27,dm__policy_type_special auto,40538310000000.0
35,dm__policy_special l3,40538310000000.0
33,dm__policy_special l1,40538310000000.0
30,dm__policy_personal l1,1275840000000.0
26,dm__policy_type_personal auto,1275840000000.0
32,dm__policy_personal l3,1275840000000.0
31,dm__policy_personal l2,1275840000000.0
42,dm__vehicle_class_luxury car,463.684
43,dm__vehicle_class_luxury suv,417.9406
