In [51]:
import pandas as pd
import matplotlib as plt  
import statsmodels.api as sm
import scipy.stats as st
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 
from sklearn.ensemble import RandomForestRegressor

##### Read data file from folder

In [52]:
data_set = pd.read_csv("Credit.csv")

In [53]:
data_set.head(10)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
5,6,80.18,8047,569,4,77,10,Male,No,No,Caucasian,1151
6,7,20.996,3388,259,2,37,12,Female,No,No,African American,203
7,8,71.408,7114,512,2,87,9,Male,No,No,Asian,872
8,9,15.125,3300,266,5,66,13,Female,No,No,Caucasian,279
9,10,71.061,6819,491,3,41,19,Female,Yes,Yes,African American,1350


##### Delete first column

In [54]:
data_set = data_set.drop("Unnamed: 0", axis = 1)

##### Some insights about the data

In [55]:
data_set.shape

(400, 11)

In [56]:
data_set.describe()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,45.218885,4735.6,354.94,2.9575,55.6675,13.45,520.015
std,35.244273,2308.198848,154.724143,1.371275,17.249807,3.125207,459.758877
min,10.354,855.0,93.0,1.0,23.0,5.0,0.0
25%,21.00725,3088.0,247.25,2.0,41.75,11.0,68.75
50%,33.1155,4622.5,344.0,3.0,56.0,14.0,459.5
75%,57.47075,5872.75,437.25,4.0,70.0,16.0,863.0
max,186.634,13913.0,982.0,9.0,98.0,20.0,1999.0


In [57]:
data_set.isnull().sum()

Income       0
Limit        0
Rating       0
Cards        0
Age          0
Education    0
Gender       0
Student      0
Married      0
Ethnicity    0
Balance      0
dtype: int64

 90 cells have zero as a value in their target column

In [58]:
count_balance_values = data_set['Balance'].value_counts().head(5)
count_balance_values

0       90
133      3
1048     3
531      3
333      2
Name: Balance, dtype: int64

Check the maximum and minimum value of value in columns

In [59]:
max_balance = data_set['Balance'].max()
max_balance

1999

In [60]:
min_balance = data_set['Balance'].min()
min_balance

0

#### Subtask a

Used the function recommended in the task to map the categorical values to binary values

In [61]:
data_set = pd.get_dummies(data_set, columns = ['Gender', 'Ethnicity'])

In [62]:
data_set.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,14.891,3606,283,2,34,11,No,Yes,333,0,1,0,0,1
1,106.025,6645,483,3,82,15,Yes,Yes,903,1,0,0,1,0
2,104.593,7075,514,4,71,11,No,No,580,0,1,0,1,0
3,148.924,9504,681,3,36,11,No,No,964,1,0,0,1,0
4,55.882,4897,357,2,68,16,No,Yes,331,0,1,0,0,1


To avoid adding two columns to the existing number of columns, the two boolean values in student and married were mapped to boolean values using the python replace function

In [63]:
data_set[['Student', 'Married']] = data_set[['Student', 'Married']].replace({'No': 0, 'Yes': 1})

In [64]:
data_set.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,14.891,3606,283,2,34,11,0,1,333,0,1,0,0,1
1,106.025,6645,483,3,82,15,1,1,903,1,0,0,1,0
2,104.593,7075,514,4,71,11,0,0,580,0,1,0,1,0
3,148.924,9504,681,3,36,11,0,0,964,1,0,0,1,0
4,55.882,4897,357,2,68,16,0,1,331,0,1,0,0,1


#### Subtask b

Split the data into training and test data with 20% for test data as instructed. The train_test_split method from sklearn randomly splits the data into to parts and not in the given order in the data frame so that the data is representive.Therfore we don't have to shuffle them.

In [65]:
train, test = train_test_split(data_set, test_size = 0.20, random_state=1)

In [66]:
train.shape

(320, 14)

In [67]:
test.shape

(80, 14)

#### Subtask c

 Ordinary least squares is a method used for estimating the parameters of linear regression equation.

In [68]:
columns = ['Income','Rating', 'Cards','Age', 'Education', 'Student','Married','Gender_Female', 'Gender_Male','Ethnicity_African American','Ethnicity_Asian', 'Ethnicity_Caucasian']

In [69]:
train_columns = train[columns].to_numpy()

In [70]:
X = sm.add_constant(train_columns)

In [71]:
Y = train['Balance']

In [72]:
model = sm.OLS(Y,X) 
results = model.fit() 
results.params

const   -293.337323
x1        -7.659020
x2         3.933932
x3         3.248996
x4        -0.769736
x5         0.019458
x6       416.094243
x7        -4.870677
x8      -146.270822
x9      -147.066501
x10     -112.269516
x11      -84.315561
x12      -96.752246
dtype: float64

In [73]:
results.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.954
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,635.6
Date:,"Sun, 08 May 2022",Prob (F-statistic):,1.74e-199
Time:,01:11:42,Log-Likelihood:,-1915.3
No. Observations:,320,AIC:,3853.0
Df Residuals:,309,BIC:,3894.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-293.3373,18.958,-15.473,0.000,-330.640,-256.034
x1,-7.6590,0.262,-29.271,0.000,-8.174,-7.144
x2,3.9339,0.059,66.906,0.000,3.818,4.050
x3,3.2490,3.985,0.815,0.416,-4.593,11.091
x4,-0.7697,0.320,-2.407,0.017,-1.399,-0.140
x5,0.0195,1.770,0.011,0.991,-3.462,3.501
x6,416.0942,19.361,21.491,0.000,377.998,454.190
x7,-4.8707,11.450,-0.425,0.671,-27.401,17.660
x8,-146.2708,11.072,-13.211,0.000,-168.057,-124.485

0,1,2,3
Omnibus:,13.164,Durbin-Watson:,2.18
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.127
Skew:,0.513,Prob(JB):,0.000856
Kurtosis:,2.912,Cond. No.,3.65e+18


In [74]:
x_test = sm.add_constant(test[columns])
y_pred = results.predict(x_test)
y_pred.head()

398    -106.037684
125      -7.964884
328     280.149043
339    1026.983979
172     210.964531
dtype: float64

In [75]:
rmse = mean_squared_error(test['Balance'],y_pred, squared=False)
rmse

122.29847133142086

In [76]:
normalize_rmse = rmse / 1999
normalize_rmse

0.06117982557849968

#### Subtask d

In [77]:
columns_d = columns

In [78]:
columns_d.remove('Ethnicity_Caucasian')
columns_d

['Income',
 'Rating',
 'Cards',
 'Age',
 'Education',
 'Student',
 'Married',
 'Gender_Female',
 'Gender_Male',
 'Ethnicity_African American',
 'Ethnicity_Asian']

In [79]:
train_columns_d = train[columns_d].to_numpy()

In [80]:
X_d = sm.add_constant(train_columns_d)

In [81]:
Y_d = train['Balance']

In [82]:
model_d = sm.OLS(Y_d,X_d) 
results_d = model_d.fit() 
results_d.params

const   -357.838821
x1        -7.659020
x2         3.933932
x3         3.248996
x4        -0.769736
x5         0.019458
x6       416.094243
x7        -4.870677
x8      -178.521571
x9      -179.317250
x10      -15.517270
x11       12.436685
dtype: float64

In [83]:
results_d.summary()


0,1,2,3
Dep. Variable:,Balance,R-squared:,0.954
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,635.6
Date:,"Sun, 08 May 2022",Prob (F-statistic):,1.74e-199
Time:,01:11:42,Log-Likelihood:,-1915.3
No. Observations:,320,AIC:,3853.0
Df Residuals:,309,BIC:,3894.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-357.8388,23.584,-15.173,0.000,-404.244,-311.434
x1,-7.6590,0.262,-29.271,0.000,-8.174,-7.144
x2,3.9339,0.059,66.906,0.000,3.818,4.050
x3,3.2490,3.985,0.815,0.416,-4.593,11.091
x4,-0.7697,0.320,-2.407,0.017,-1.399,-0.140
x5,0.0195,1.770,0.011,0.991,-3.462,3.501
x6,416.0942,19.361,21.491,0.000,377.998,454.190
x7,-4.8707,11.450,-0.425,0.671,-27.401,17.660
x8,-178.5216,13.149,-13.577,0.000,-204.394,-152.649

0,1,2,3
Omnibus:,13.164,Durbin-Watson:,2.18
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.127
Skew:,0.513,Prob(JB):,0.000856
Kurtosis:,2.912,Cond. No.,6.74e+17


In [84]:
x_test_d = sm.add_constant(test[columns_d])
y_pred_d = results_d.predict(x_test_d)

In [85]:
rmse_d = mean_squared_error(test['Balance'],y_pred_d, squared=False)
rmse_d

122.29847133142093

In [86]:
normalize_rmse_d = rmse_d / 1999
normalize_rmse_d

0.061179825578499714

In [87]:
rmse_d - rmse

7.105427357601002e-14

#### Subtask e

In [88]:
def calculate_OLS(list_columns):
    
    X_ols = sm.add_constant(train[list_columns].to_numpy())
    y = train['Balance']
    mod = sm.OLS(y, X_ols)
    res = mod.fit()
    return res.summary()

In [89]:
list1 = ['Income', 'Limit', 'Age']
list2 =  ['Income', 'Rating', 'Age']
list3 = ['Income', 'Limit', 'Rating', 'Age']

In [90]:
res1 = calculate_OLS(list1)
res1

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.875
Model:,OLS,Adj. R-squared:,0.874
Method:,Least Squares,F-statistic:,739.4
Date:,"Sun, 08 May 2022",Prob (F-statistic):,1.87e-142
Time:,01:11:42,Log-Likelihood:,-2073.6
No. Observations:,320,AIC:,4155.0
Df Residuals:,316,BIC:,4170.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-352.9687,34.628,-10.193,0.000,-421.099,-284.838
x1,-7.1274,0.421,-16.945,0.000,-7.955,-6.300
x2,0.2597,0.006,40.785,0.000,0.247,0.272
x3,-0.7208,0.516,-1.398,0.163,-1.735,0.294

0,1,2,3
Omnibus:,81.62,Durbin-Watson:,1.832
Prob(Omnibus):,0.0,Jarque-Bera (JB):,147.833
Skew:,1.409,Prob(JB):,7.91e-33
Kurtosis:,4.774,Cond. No.,20200.0


In [91]:
res2 = calculate_OLS(list2)
res2

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.882
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,784.5
Date:,"Sun, 08 May 2022",Prob (F-statistic):,4.99e-146
Time:,01:11:42,Log-Likelihood:,-2065.3
No. Observations:,320,AIC:,4139.0
Df Residuals:,316,BIC:,4154.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-486.3902,35.341,-13.763,0.000,-555.924,-416.856
x1,-7.1582,0.409,-17.493,0.000,-7.963,-6.353
x2,3.8612,0.092,42.063,0.000,3.681,4.042
x3,-0.8263,0.502,-1.645,0.101,-1.815,0.162

0,1,2,3
Omnibus:,87.81,Durbin-Watson:,1.897
Prob(Omnibus):,0.0,Jarque-Bera (JB):,174.31
Skew:,1.443,Prob(JB):,1.41e-38
Kurtosis:,5.177,Cond. No.,1590.0


In [92]:
res3 = calculate_OLS(list3) 
res3

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.882
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,589.2
Date:,"Sun, 08 May 2022",Prob (F-statistic):,8.15e-145
Time:,01:11:42,Log-Likelihood:,-2064.7
No. Observations:,320,AIC:,4139.0
Df Residuals:,315,BIC:,4158.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-460.7138,42.150,-10.930,0.000,-543.646,-377.782
x1,-7.1879,0.410,-17.536,0.000,-7.994,-6.381
x2,0.0543,0.049,1.117,0.265,-0.041,0.150
x3,3.0646,0.719,4.261,0.000,1.650,4.480
x4,-0.8018,0.503,-1.595,0.112,-1.791,0.187

0,1,2,3
Omnibus:,88.701,Durbin-Watson:,1.882
Prob(Omnibus):,0.0,Jarque-Bera (JB):,174.694
Skew:,1.467,Prob(JB):,1.16e-38
Kurtosis:,5.12,Cond. No.,25300.0


Interpretation of results: 
1) Coefficients: For each variable, it is the measurement of how change in that variable affects the independent variable. It is the slope in ‘y = mx + b
2) Confidence interval is 95% as default, so threshold of signifigance is 0.5.
3)For the feature 'Age', for example,  we can't reject the null hypothesis that the feature and the target value are not independant. That is because its value is larger than 0.05.
4) list2 and list3 have similar R-squared value and are better than list1. The result of value of the R-squared value means the relationship between the three variables in the list and the target variable explains 88% and at the same time it is close to its adjusted R-squared value, which means no feature is irrelevant regarding the prediction.

#### Subtask f

Corrleation is a statistal measure that accesses the linear relationship between two values.
Examples of correlation tests:
- Pearson Korrelation 
- Spearman Korrelation 
- Kendalls Tau

All three methods have a range from -1 to 1. A value of 1 indicates that two variables are positively corrleated, -1 that the are negatively correlated. 0 there is no correlation.
Pearson will return a value of 1 if two values are linearly correlated. If the values are monotonically correlated than Spearman is a better method. Spearman can detect linearly and monotically corrleated variables. Kendalls Tau is a similar test to Spearman.

In [93]:
corr_pearson = data_set.corr()

In [94]:
corr_pearson.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
Income,1.0,0.792088,0.791378,-0.018273,0.175338,-0.027692,0.019632,0.035652,0.463656,-0.010738,0.010738,0.040132,-0.017137,-0.019701
Limit,0.792088,1.0,0.99688,0.010231,0.100888,-0.023549,-0.006015,0.031155,0.861697,0.009397,-0.009397,0.03632,-0.032427,-0.003081
Rating,0.791378,0.99688,1.0,0.053239,0.103165,-0.030136,-0.002028,0.036751,0.863625,0.008885,-0.008885,0.037598,-0.035999,-0.00107
Cards,-0.018273,0.010231,0.053239,1.0,0.042948,-0.051084,-0.026164,-0.009695,0.086456,-0.022658,0.022658,0.000878,0.005591,-0.005631
Age,0.175338,0.100888,0.103165,0.042948,1.0,0.003619,-0.029844,-0.073136,0.001835,0.004015,-0.004015,0.061169,-0.059623,-0.000822
Education,-0.027692,-0.023549,-0.030136,-0.051084,0.003619,1.0,0.072085,0.048911,-0.008062,-0.005049,0.005049,0.013827,0.029586,-0.037725
Student,0.019632,-0.006015,-0.002028,-0.026164,-0.029844,0.072085,1.0,-0.076974,0.259018,0.055034,-0.055034,0.001931,0.053534,-0.048334
Married,0.035652,0.031155,0.036751,-0.009695,-0.073136,0.048911,-0.076974,1.0,-0.005673,0.012452,-0.012452,-0.102707,0.088595,0.011418
Balance,0.463656,0.861697,0.863625,0.086456,0.001835,-0.008062,0.259018,-0.005673,1.0,0.021474,-0.021474,0.01372,-0.009812,-0.003288
Gender_Female,-0.010738,0.009397,0.008885,-0.022658,0.004015,-0.005049,0.055034,0.012452,0.021474,1.0,-1.0,-0.014288,0.025425,-0.009831


According to correlation matrix that is visualized above, the features that are correlated with 'Balance' the most are limit and rating. They are positively correlated to the Balance. 

In [95]:
corr_spearman = data_set.corr(method = 'spearman') 
corr_spearman.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
Income,1.0,0.657411,0.653274,-0.054929,0.147892,-0.046078,0.009779,0.01231,0.361291,0.005026,-0.005026,0.020293,-0.041974,0.019075
Limit,0.657411,1.0,0.99576,-0.000837,0.064122,-0.03281,0.009562,0.033353,0.889449,0.024112,-0.024112,0.01332,-0.038646,0.022192
Rating,0.653274,0.99576,1.0,0.043463,0.067763,-0.041426,0.009887,0.041686,0.889222,0.022812,-0.022812,0.015552,-0.041378,0.022647
Cards,-0.054929,-0.000837,0.043463,1.0,0.046673,-0.052447,-0.027004,-0.019804,0.064288,-0.010133,0.010133,0.009928,0.001174,-0.009593
Age,0.147892,0.064122,0.067763,0.046673,1.0,0.011273,-0.028296,-0.077186,-0.01068,0.008537,-0.008537,0.060164,-0.056912,-0.002317
Education,-0.046078,-0.03281,-0.041426,-0.052447,0.011273,1.0,0.077753,0.044063,-0.006463,-0.002873,0.002873,0.009097,0.028218,-0.03245
Student,0.009779,0.009562,0.009887,-0.027004,-0.028296,0.077753,1.0,-0.076974,0.245478,0.055034,-0.055034,0.001931,0.053534,-0.048334
Married,0.01231,0.033353,0.041686,-0.019804,-0.077186,0.044063,-0.076974,1.0,-0.001028,0.012452,-0.012452,-0.102707,0.088595,0.011418
Balance,0.361291,0.889449,0.889222,0.064288,-0.01068,-0.006463,0.245478,-0.001028,1.0,0.032225,-0.032225,0.008376,-0.021557,0.011563
Gender_Female,0.005026,0.024112,0.022812,-0.010133,0.008537,-0.002873,0.055034,0.012452,0.032225,1.0,-1.0,-0.014288,0.025425,-0.009831


In [96]:
corr_kendall = data_set.corr(method = 'kendall') 
corr_kendall.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
Income,1.0,0.47736,0.473671,-0.040239,0.09855,-0.033921,0.007994,0.010064,0.250656,0.004109,-0.004109,0.01659,-0.034315,0.015594
Limit,0.47736,1.0,0.947251,-0.001287,0.041427,-0.023127,0.007818,0.027269,0.716424,0.019714,-0.019714,0.01089,-0.031596,0.018144
Rating,0.473671,0.947251,1.0,0.030216,0.044283,-0.029293,0.008091,0.034112,0.718157,0.018667,-0.018667,0.012727,-0.03386,0.018532
Cards,-0.040239,-0.001287,0.030216,1.0,0.03321,-0.041765,-0.02423,-0.01777,0.048508,-0.009092,0.009092,0.008908,0.001054,-0.008607
Age,0.09855,0.041427,0.044283,0.03321,1.0,0.006612,-0.023315,-0.0636,-0.007351,0.007035,-0.007035,0.049574,-0.046894,-0.001909
Education,-0.033921,-0.023127,-0.029293,-0.041765,0.006612,1.0,0.066235,0.037535,-0.004779,-0.002447,0.002447,0.007749,0.024038,-0.027643
Student,0.007994,0.007818,0.008091,-0.02423,-0.023315,0.066235,1.0,-0.076974,0.20478,0.055034,-0.055034,0.001931,0.053534,-0.048334
Married,0.010064,0.027269,0.034112,-0.01777,-0.0636,0.037535,-0.076974,1.0,-0.000858,0.012452,-0.012452,-0.102707,0.088595,0.011418
Balance,0.250656,0.716424,0.718157,0.048508,-0.007351,-0.004779,0.20478,-0.000858,1.0,0.026883,-0.026883,0.006987,-0.017983,0.009646
Gender_Female,0.004109,0.019714,0.018667,-0.009092,0.007035,-0.002447,0.055034,0.012452,0.026883,1.0,-1.0,-0.014288,0.025425,-0.009831


All three show the same correlations with difference in the value of the corrlation, but all have high corellations between the same values.

#### Subtask g

In [97]:
list_states = [1, 33, 135, 123, 99, 22, 100]

In [98]:
all_lists = [list1, list, list3]

In [102]:
def create_forest(list_states): 
    result = [] 
    for l in all_lists:
        inner = [] 
        for val in list_states: 
            X = train[l].to_numpy()
            #print(X.shape)
            y = train['Balance'] 
            rf = RandomForestRegressor(random_state = val) 
            rf.fit(X,y) 
            y_pred= rf.predict(test[l])
            sqr = mean_squared_error(y_pred, test['Balance'])
            inner.append(sqr)
        result.append(inner)
    return result
                                                  

In [103]:
create_forest(list_states)



[[38949.1022375,
  38757.419825000004,
  37358.242906249994,
  38897.991799999996,
  38245.20931375,
  39529.274866249994,
  38346.645053750006],
 [451.8151562500001,
  535.3746662500002,
  637.8586812500005,
  741.6010887500003,
  541.5142362500001,
  659.7987224999991,
  497.11505500000004],
 [37577.587396250005,
  37567.40571,
  38429.423196250005,
  37378.34696625,
  37549.796665,
  37815.87270749999,
  38441.44925125]]

Different values according to the random states

#### Subtask h

Multicollinearity refers to the condition in which two or more independant variables (predictors) are highly correlated. This means one variable can be used to predict the other. It is considered a problem because one cannot identify which variable has the most effect on the target variable. Morever, this may cause a poor estimation of the regression coeffcients.

Variance inflation factor (VIF) can be used to detect multicollinearity by identifying correlations between variables and determine the strength of the relationships.

Possible solutions: 
1) Remove some highly correlated variables 
2) Combine the independant variables linearly 
3) Use forms of regression analysis that can handle multicollinearity, such as: LASSO and ridge regressions.