# Lab - feature selection

In [1]:
import pandas as pd
import numpy as np
import statistics as stats
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import PowerTransformer, StandardScaler, minmax_scale, OneHotEncoder
%matplotlib inline

from scipy.stats import iqr
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
df = pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')

* **removing useless columns**

In [3]:
df.drop(['effective_to_date', 'month'], axis=1, inplace=True)

* **remove outliers function**

In [4]:
def remove_outliers(df):
    for c in df.columns:
            pct_75 = np.percentile(df[c], 75)
            pct_25 = np.percentile(df[c], 25)
            upper_bound = pct_75 + 1.5*iqr(df[c])
            lower_bound = pct_25 - 1.5*iqr(df[c])
            condition = (df[c] < upper_bound) & (df[c] > lower_bound)
            df[c] = df[c][condition]
    return df

* **filling nas**

In [5]:
df.isnull().values.any()

False

* **X,y split**

In [6]:
X = df.drop('total_claim_amount', axis=1)
y = df[['total_claim_amount']]

* **train test split**

In [7]:
tt_ratio = 0.3
rand_seed = 40

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=tt_ratio, 
                                                    random_state=rand_seed)

* **standardizing the data**

In [8]:
#split into numeric and categorical
numeric = df.select_dtypes(include=[np.number])
categorical = df.select_dtypes(include=[object])

#get numeric and cat columns' names
numeric_columns_X = list(numeric.drop('total_claim_amount', axis=1).columns)
categorical_columns_X = list(categorical.columns)


In [9]:
#creating a transformer
pt = PowerTransformer()
ct = ColumnTransformer([('pt', pt, numeric_columns_X),
                        ('dm', OneHotEncoder(drop='first'), categorical_columns_X)],
                       remainder='drop', verbose_feature_names_out=True, verbose=True).fit(X_train)

[ColumnTransformer] ............ (1 of 2) Processing pt, total=   0.0s
[ColumnTransformer] ............ (2 of 2) Processing dm, total=   0.0s


In [10]:
#applying transformer
X_train_ct = pd.DataFrame(ct.transform(X_train), columns=ct.get_feature_names_out())
X_test_ct = pd.DataFrame(ct.transform(X_test), columns=ct.get_feature_names_out())

* **creating a model**

In [11]:
#adding a constant
X_train_ct_const = sm.add_constant(X_train_ct.to_numpy())
X_test_ct_const = sm.add_constant(X_test_ct)

In [12]:
#model
model = sm.OLS(y_train, X_train_ct_const).fit()

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.739
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     449.0
Date:                Mon, 20 Feb 2023   Prob (F-statistic):               0.00
Time:                        16:53:40   Log-Likelihood:                -48108.
No. Observations:                7482   AIC:                         9.631e+04
Df Residuals:                    7434   BIC:                         9.664e+04
Df Model:                          47                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         55.4892     15.806      3.511      0.0

* **evaluation**

In [13]:
#model prediction values
y_pred_test = pd.DataFrame(model.predict(X_test_ct_const), columns=['claim_amount']) 
y_pred_train = pd.DataFrame(model.predict(X_train_ct_const), columns=['claim_amount'])

In [14]:
#MSE, MAE
print(mse(y_test,y_pred_test))
print(mae(y_test,y_pred_test))
print(mse(y_train,y_pred_train))

21161.05836268643
99.56643410548158
22513.476259874635


In [15]:
#RMSE
print(math.sqrt(mse(y_test,y_pred_test)))

145.46841018821382


In [16]:
#R2
R2=r2_score(y_test, y_pred_test)
R2

0.7469018733957821

In [17]:
#adjusted R2
model = LinearRegression()    # model
model.fit(X_train_ct, y_train)   # model train


R2_test = model.score(X_test_ct, y_test)
R2_train = model.score(X_train_ct, y_train)

Adj_R2 = 1 - (1-R2) * (len(y_test)-1) / (len(y_test) - X_test.shape[1]-1)
Adj_R2

0.745233094539051

In [18]:
#features importances
features_importances = pd.DataFrame(data={'Attribute': X_train_ct.columns,
                                          'Importance': abs(model.coef_.reshape(len(X_train_ct.columns),))})

features_importances = features_importances.sort_values(by='Importance', ascending=False)
features_importances

Unnamed: 0,Attribute,Importance
26,dm__policy_type_personal auto,168985600000000.0
31,dm__policy_personal l2,168985600000000.0
32,dm__policy_personal l3,168985600000000.0
30,dm__policy_personal l1,168985600000000.0
27,dm__policy_type_special auto,10721670000000.0
34,dm__policy_special l2,10721670000000.0
35,dm__policy_special l3,10721670000000.0
33,dm__policy_special l1,10721670000000.0
42,dm__vehicle_class_luxury car,460.9929
43,dm__vehicle_class_luxury suv,457.0996


## Week 7 Day 1 - Feature selection

In [19]:
from sklearn.linear_model import Lasso, Ridge 

#### OLS

In [20]:
model = LinearRegression()
model.fit(X_train_ct, y_train)
print(model.score(X_train_ct, y_train))
print(model.score(X_test_ct, y_test))

0.7394800796926333
0.7468957919551384


#### Lasso

In [21]:
model=Lasso(alpha=0.05)
model.fit(X_train_ct, y_train)
print(model.score(X_train_ct, y_train))
print(model.score(X_test_ct, y_test))

0.7394360186946489
0.7471403938699099


#### Ridge

In [22]:
model=Ridge(alpha=1000)
model.fit(X_train_ct, y_train)
print(model.score(X_train_ct, y_train))
print(model.score(X_test_ct, y_test))

0.6254605227552439
0.6238933715251345


#### RFE

In [29]:
from sklearn.feature_selection import RFE

lm = LinearRegression()

selector = RFE(lm, n_features_to_select=8, step=1, verbose=1)
selector.fit(X_train_ct, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train_ct.iloc[:,kept_features].columns)

X_train = selector.transform(X_train_ct)
X_test  = selector.transform(X_test_ct)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
X_train


Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 fe

Unnamed: 0,pt__monthly_premium_auto,dm__coverage_premium,dm__employment_status_unemployed,dm__location_code_suburban,dm__location_code_urban,dm__marital_status_single,dm__vehicle_class_luxury car,dm__vehicle_class_luxury suv
0,1.028939,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.871857,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,2.105524,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.743503,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.701676,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
7477,0.715980,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7478,-1.081895,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7479,-1.345973,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7480,-1.539842,0.0,0.0,0.0,1.0,0.0,0.0,0.0
