# Machine Learning Models

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

In [None]:
## classification ml:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [None]:
## regression ml
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree.export import export_text
from sklearn.neighbors import KNeighborsRegressor

In [None]:
### evaluation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
### optimize model
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, FastICA

In [None]:
ds = pd.read_csv('./data/datamax_cleaned.csv', index_col=0)

In [None]:
#subsets
background = ['PB140: YEAR OF BIRTH','PB150: SEX','PB190: MARITAL STATUS','PB200: CONSENSUAL UNION', 'PB220A: CITIZENSHIP']

education = ['PE010: CURRENT EDUCATION ACTIVITY', 'PE040: HIGHEST ISCED LEVEL ATTAINED']

work = ['PL031: SELF-DEFINED CURRENT ECONOMIC STATUS', 'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK', 'PL051: OCCUPATION (ISCO-08 (COM))', 'PL150: MANAGERIAL POSITION']

health = ['PH010: GENERAL HEALTH',
 'PH020: SUFFER FROM ANY A CHRONIC (LONG-STANDING) ILLNESS OR CONDITION',
 'PH030: LIMITATION IN ACTIVITIES BECAUSE OF HEALTH PROBLEMS',
 'PH040: UNMET NEED FOR MEDICAL EXAMINATION OR TREATMENT',
 'PH060: UNMET NEED FOR DENTAL EXAMINATION OR TREATMENT']

job= ['PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME',
       'PY035G: CONTRIBUTIONS TO INDIVIDUAL PRIVATE PENSION PLANS',
      'PY080G: PENSION FROM INDIVIDUAL PRIVATE PLANS',
      "PY090G++: SOCIAL BENEFITS"]
jobdiff=['PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME',
       'PY035G: CONTRIBUTIONS TO INDIVIDUAL PRIVATE PENSION PLANS',
      'PY080G: PENSION FROM INDIVIDUAL PRIVATE PLANS',
       'PY090G: UNEMPLOYMENT BENEFITS', 'PY100G: OLD-AGE BENEFITS',
       'PY110G: SURVIVOR’ BENEFITS', 'PY120G: SICKNESS BENEFITS',
       'PY130G: DISABILITY BENEFITS', 'PY140G: EDUCATION-RELATED ALLOWANCES']


bneeds= ['PD020: Replace worn-out clothes by some new (not second-hand) ones',
       'PD030: Two pairs of properly fitting shoes',
       'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
       'PD060: Regularly participate in a leisure activity',
       'PD070: Spend a small amount of money each week on yourself',
       'PD080: Internet connection for personal use at home']


sat =['PW010: OVERALL LIFE SATISFACTION', 'PW020: MEANING OF LIFE',
       'PW030: SATISFACTION WITH FINANCIAL SITUATION',
       'PW040: SATISFACTION WITH ACCOMMODATION', 'PW050: BEING VERY NERVOUS',
       'PW060: FEELING DOWN IN THE DUMPS', 'PW070: FEELING CALM AND PEACEFUL',
       'PW080: FEELING DOWNHEARTED OR DEPRESSED', 'PW090: BEING HAPPY',
       'PW120: SATISFACTION WITH TIME USE',
       'PW130: TRUST IN THE POLITICAL SYSTEM',
       'PW140: TRUST IN THE LEGAL SYSTEM', 'PW150: TRUST IN THE POLICE',
       'PW160: SATISFACTION WITH PERSONAL RELATIONSHIPS',
       'PW170: PERSONAL MATTERS (ANYONE TO DISCUSS WITH)',
       'PW180: HELP FROM OTHERS', 'PW190: TRUST IN OTHERS',
       'PW200: SATISFACTION WITH RECREATIONAL OR GREEN AREAS',
       'PW210: SATISFACTION WITH LIVING ENVIRONMENT',
       'PW220: PHYSICAL SECURITY']

# Flatten the thematic subsets into one list of column names, keeping the
# order of the subsets and of the names inside each subset.
subsets = [background, education, work, health, job, bneeds, sat]
columns = [name for group in subsets for name in group]
columns

# 0 Superfunctions

In [None]:
# refresh ds
def reload():
    """Read the cleaned survey CSV from disk and return it as a new DataFrame.

    The first CSV column becomes the index. Nothing is cached: every call
    re-reads the file.
    """
    source_path = './data/datamax_cleaned.csv'
    fresh_frame = pd.read_csv(source_path, index_col=0)
    return fresh_frame

## 0.1 binning

In [None]:
def pbin(column_to_bin, x, data=None):
    """Cut a numeric column into ``x`` equal-width bins.

    Parameters
    ----------
    column_to_bin : str
        Name of the column to discretise.
    x : int
        Number of bins; the bins are labelled ``0 .. x-1``.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the module-level ``ds``
        is used, which is what the two-argument call has always done.

    Returns
    -------
    pandas.Series
        The bin label of every row, on the same index as the input column.
    """
    frame = ds if data is None else data
    # list(range(x)) avoids re-using the name ``x`` as a loop variable.
    bin_labels = list(range(x))
    return pd.cut(frame[column_to_bin], x, labels=bin_labels)

def qbin(column_to_bin, x, data=None):
    """Cut a numeric column into ``x`` quantile-based bins.

    Unlike ``pbin`` the bin edges are quantiles (``pd.qcut``), not
    equal-width intervals.

    Parameters
    ----------
    column_to_bin : str
        Name of the column to discretise.
    x : int
        Number of quantile bins; the bins are labelled ``0 .. x-1``.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the module-level ``ds``
        is used, which is what the two-argument call has always done.

    Returns
    -------
    pandas.Series
        The bin label of every row, on the same index as the input column.
    """
    frame = ds if data is None else data
    # list(range(x)) avoids re-using the name ``x`` as a loop variable.
    bin_labels = list(range(x))
    return pd.qcut(frame[column_to_bin], x, labels=bin_labels)

In [None]:
binned =pbin('PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME',5)

In [None]:
binned.hist()

## 0.2 generate train and test

In [None]:
### generate tests with several binning
# binning first
def generateTest(X_interested, y_interested, dummy, drop_na_in_y, scalertouse, random_state=None):
    """Build an 80/20 train/test split for the given feature and target columns.

    The data is read from the module-level ``ds`` frame.

    Parameters
    ----------
    X_interested : list of str or str
        Feature column name(s). A single name is treated as a one-element
        list. The target column is removed from the features if present.
    y_interested : str
        Target column name.
    dummy : str
        ``'yes'`` one-hot encodes every requested feature that appears in the
        hard-coded ``categorical`` list below (first level dropped). Any other
        value leaves the features as they are.
    drop_na_in_y : str
        ``'yes'`` keeps only rows whose target is ``>= 0``; negative codes and
        NaN fail that comparison and are dropped.
    scalertouse : str
        ``"Standard"``, ``"MinMax"`` (``"MaxMin"`` is accepted as an alias
        because existing calls spell it that way) or ``"Normalizer"``. Any
        other value means no scaling.
    random_state : int or None, optional
        Forwarded to ``train_test_split`` so a split can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The X parts are NumPy arrays
        when a scaler was applied and DataFrames otherwise.
    """
    if isinstance(X_interested, str):
        X_interested = [X_interested]
    outcomelist = y_interested
    # The target must not double as a feature.
    featureslist = [col for col in X_interested if col != outcomelist]
    data = ds[featureslist + [outcomelist]].copy()

    # create dummy
    categorical = [
        'PB150: SEX', 'PB190: MARITAL STATUS',
        'PB200: CONSENSUAL UNION', 'PB220A: CITIZENSHIP',
        'PE010: CURRENT EDUCATION ACTIVITY',
        'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
        'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK',
        'PL051: OCCUPATION (ISCO-08 (COM))', 'PL150: MANAGERIAL POSITION',
        'PH030: LIMITATION IN ACTIVITIES BECAUSE OF HEALTH PROBLEMS',
        'PH040: UNMET NEED FOR MEDICAL EXAMINATION OR TREATMENT',
        'PH060: UNMET NEED FOR DENTAL EXAMINATION OR TREATMENT',
        'PD020: Replace worn-out clothes by some new (not second-hand) ones',
        'PD030: Two pairs of properly fitting shoes',
        'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
        'PD060: Regularly participate in a leisure activity',
        'PD070: Spend a small amount of money each week on yourself',
        'PD080: Internet connection for personal use at home',
    ]
    if dummy == 'yes':
        # A list in the caller's feature order: a set has no stable order and
        # newer pandas versions reject a set as a column selector.
        create_dummy = [col for col in featureslist if col in categorical]
        if create_dummy:
            data = pd.get_dummies(data, columns=create_dummy, drop_first=True)

    ## drop na in outcome
    if drop_na_in_y == 'yes':
        data = data[data[outcomelist] >= 0]

    ## split dataset again in x and y
    Xtouse = data.drop(columns=[outcomelist])
    y = data[outcomelist]

    X_train, X_test, y_train, y_test = train_test_split(
        Xtouse, y, test_size=0.2, random_state=random_state)

    # scaler: fitted on the training part only, so that no statistic of the
    # test rows leaks into the features the models are trained on.
    scaler_classes = {
        "Standard": StandardScaler,
        "MinMax": MinMaxScaler,
        "MaxMin": MinMaxScaler,
        "Normalizer": Normalizer,
    }
    scaler_class = scaler_classes.get(scalertouse)
    if scaler_class is not None:
        scaler = scaler_class()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

## 0.3 Regression

In [None]:
# maxximize function to split and run all the models:

def _print_regression_scores(label, y_true, y_pred, *tail):
    """Print MAE, MSE and R2 for one model; ``tail`` is appended to the R2 line."""
    print(f"{label} MAE: ", mean_absolute_error(y_true, y_pred))
    print(f"{label} MSE: ", mean_squared_error(y_true, y_pred))
    print(f"{label} R2: ", r2_score(y_true, y_pred), *tail)


def regmodel(X_interested, y_interested, dummy, drop_na_in_y, scalertouse):
    """Fit three regressors on one train/test split and print their scores.

    The models are a linear regression, a decision tree
    (``random_state=29``) and a 3-nearest-neighbours regressor. For each one
    the mean absolute error, the mean squared error and R2 on the test part
    are printed. All arguments are passed straight to ``generateTest``.

    Returns
    -------
    None
        The results are only printed.
    """
    # One call, one split: taking each piece from its own generateTest() call
    # would mix four different random splits, so X and y rows would no longer
    # belong together.
    X_train, X_test, y_train, y_test = generateTest(
        X_interested, y_interested, dummy, drop_na_in_y, scalertouse)

    print("X: ", X_interested)
    print("Y: ", y_interested, "\n")

    # Linear Regression
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    print("Intercept: ", lm.intercept_)
    print("Coef: ", lm.coef_)
    _print_regression_scores("linear", y_test, lm.predict(X_test), "\n")

    ### Regression Tree
    regr = DecisionTreeRegressor(random_state=29)
    regr.fit(X_train, y_train)
    _print_regression_scores("Regression Tree", y_test, regr.predict(X_test), "\n")

    ### KNeighborsRegressor
    knnr = KNeighborsRegressor(n_neighbors=3)
    knnr.fit(X_train, y_train)
    _print_regression_scores("KNeighbores", y_test, knnr.predict(X_test))

In [None]:
regmodel(bneeds,'PW130: TRUST IN THE POLITICAL SYSTEM', 'yes', 'yes', 'no')

## 0.4  Classification

In [None]:
# maxximize function to split and run all the models:
## y binning first
def _first_class_precision(conf):
    """Return, in percent, conf[0, 0] / (conf[0, 0] + conf[1, 0]).

    With scikit-learn's layout (rows = true class, columns = predicted class)
    this is the precision of the first class label for a two-class target.
    NOTE(review): for targets with more than two classes only the first two
    rows of column 0 are counted; confirm that this is intended.
    """
    return round(conf[0, 0] / (conf[1, 0] + conf[0, 0]) * 100, 2)


def class_bin(X_interested, y_interested, dummy, drop_na_in_y, scalertouse):
    """Fit three classifiers on one train/test split and print their scores.

    The models are a logistic regression, a decision tree and a
    2-nearest-neighbours classifier. For each one the test accuracy and the
    value of ``_first_class_precision`` are printed; the confusion matrix is
    printed for the k-NN model only. All arguments are passed straight to
    ``generateTest``.

    Returns
    -------
    None
        The results are only printed.
    """
    # One call, one split: taking each piece from its own generateTest() call
    # would mix four different random splits, so X and y rows would no longer
    # belong together.
    X_train, X_test, y_train, y_test = generateTest(
        X_interested, y_interested, dummy, drop_na_in_y, scalertouse)

    print("X: ", X_interested)
    print("Y: ", y_interested, "\n")

    # logistic regression
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    conf = confusion_matrix(y_test, lr.predict(X_test))
    print( "Logistic regression - accuracy: ", str(round(lr.score(X_test, y_test)*100,2)))
    print( "Logistic regression - precision", str(_first_class_precision(conf)), "\n")

    # Decision tree classifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    conf = confusion_matrix(y_test, dtc.predict(X_test))
    print( "Decision tree - accuracy  ", str(round(dtc.score(X_test, y_test)*100,2)))
    print( "Decision tree - precision", str(_first_class_precision(conf)), "\n")

    # support vector machine is left out here (takes too much time)

    # KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=2)  # n_neighbors means k
    knn.fit(X_train, y_train)
    conf = confusion_matrix(y_test, knn.predict(X_test))
    print(conf)
    print( "KNeighborsClassifier - accuracy  ", str(round(knn.score(X_test, y_test)*100,2)))
    print( "KNeighborsClassifier - precision", str(_first_class_precision(conf)), "\n")

In [None]:
class_bin(background, 'PW130: TRUST IN THE POLITICAL SYSTEM', 'yes' , 'yes', 'MaxMin')

In [None]:
def tryclass_bin(xes, yli, nrbinx, dummy, drop_na_in_y, scalertouse):
    """Bin features and target on a fresh copy of the data, then run ``class_bin``.

    Parameters
    ----------
    xes : list of str
        Feature columns. A column with more than ``nrbinx`` distinct values
        is replaced by its ``pbin`` result with ``nrbinx`` bins.
    yli : str
        Target column; it is cut into 2 equal-width bins stored as
        ``'binned_y'``, which is the column the classifiers are trained on.
    nrbinx : int
        Maximum number of distinct values a feature may keep unbinned.
    dummy, drop_na_in_y, scalertouse
        Passed on to ``class_bin``.

    NOTE(review): ``pbin`` cuts the whole value range of ``yli``; if negative
    "don't know" codes are present they land in the lowest bin - confirm.
    """
    # pbin(), class_bin() and generateTest() all read the module-level `ds`.
    # A local `ds` would be invisible to them, so the fresh frame is installed
    # globally for the duration of the run and the previous frame is restored
    # afterwards.
    global ds
    previous_ds = globals().get('ds')
    ds = reload()
    try:
        # x binning
        for col in xes:
            if len(ds[col].unique()) > nrbinx:
                ds[col] = pbin(col, nrbinx)
        for col in xes:
            print(col, ds[col].unique())

        # y binning
        ds['binned_y'] = pbin(yli, 2)
        print(ds['binned_y'].unique())

        # run on the binned target, not on the raw column
        class_bin(xes, 'binned_y', dummy, drop_na_in_y, scalertouse)
    finally:
        ds = previous_ds

Multi-class

In [None]:
## target multi
# binning first
def class_mul(X_interested, y_interested, dummy, drop_na_in_y, scalertouse):
    """Fit a decision tree and an SVM on one split and print their test accuracy.

    The SVM is created with ``decision_function_shape='ovo'``. All arguments
    are passed straight to ``generateTest``.

    Returns
    -------
    None
        The results are only printed.
    """
    # One call, one split: taking each piece from its own generateTest() call
    # would mix four different random splits, so X and y rows would no longer
    # belong together.
    X_train, X_test, y_train, y_test = generateTest(
        X_interested, y_interested, dummy, drop_na_in_y, scalertouse)

    print("X: ", X_interested)
    print("Y: ", y_interested)

    #### b) DecisionTree Classify
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    acc = dtc.score(X_test, y_test)*100
    print(f"Decision Tree Test Accuracy {round(acc, 2)}%")

    #### c) Support Vector Machine
    svm = SVC(decision_function_shape='ovo')
    svm.fit(X_train, y_train)
    acc = svm.score(X_test, y_test)*100
    print(f"SVM Algorithm Test Accuracy {round(acc, 2)}%")

In [None]:
# test
class_mul(background, sat[0], 'yes' , 'no', 'MaxMin')

In [None]:
# combine binnen y and run modell
def tryclass_multi(xes, yli, nrbinx, nrbiny, dummy, drop_na_in_y, scalertouse):
    """Bin features and target on a fresh copy of the data, then run ``class_mul``.

    Parameters
    ----------
    xes : list of str
        Feature columns. A column with more than ``nrbinx`` distinct values
        is replaced by its ``pbin`` result with ``nrbinx`` bins.
    yli : str
        Target column; it is cut into ``nrbiny`` equal-width bins stored as
        ``'binned_y'``, which is the column the classifiers are trained on.
    nrbinx : int
        Maximum number of distinct values a feature may keep unbinned.
    nrbiny : int
        Number of target classes.
    dummy, drop_na_in_y, scalertouse
        Passed on to ``class_mul``.

    NOTE(review): ``pbin`` cuts the whole value range of ``yli``; if negative
    "don't know" codes are present they land in the lowest bin - confirm.
    """
    # pbin(), class_mul() and generateTest() all read the module-level `ds`.
    # A local `ds` would be invisible to them ('binned_y' would never reach
    # the frame generateTest() selects from), so the fresh frame is installed
    # globally for the duration of the run and the previous frame is restored
    # afterwards.
    global ds
    previous_ds = globals().get('ds')
    ds = reload()
    try:
        # x binning
        for col in xes:
            if len(ds[col].unique()) > nrbinx:
                ds[col] = pbin(col, nrbinx)
        for col in xes:
            print(col, ds[col].unique())

        # y binning
        ds['binned_y'] = pbin(yli, nrbiny)
        print(ds['binned_y'].unique())

        # run
        class_mul(xes, 'binned_y', dummy, drop_na_in_y, scalertouse)
    finally:
        ds = previous_ds

In [None]:
tryclass_multi(background, sat[3], 2,2,  'yes' , 'no', 'MaxMin')

## 0.5 RFE

In [None]:
def generateRFE(X_train, y_train, limit):
    """Rank features by recursive feature elimination and return the best ones.

    An ``RFE`` selector around a ``LinearRegression`` is fitted with
    ``n_features_to_select=5`` and ``step=1``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; its column names are what gets returned.
    y_train : array-like
        Target values.
    limit : int
        Highest RFE rank that is still returned.

    Returns
    -------
    list
        Names of the columns whose rank is ``<= limit``, in column order.
        The length of the list is also printed.
    """
    # n_features_to_select must be given by keyword: current scikit-learn
    # no longer accepts it positionally.
    selector = RFE(LinearRegression(), n_features_to_select=5, step=1)
    selector.fit(X_train, y_train)
    rfe_col = [
        col for col, rank in zip(X_train.columns, selector.ranking_)
        if rank <= limit
    ]
    print(len(rfe_col))
    return rfe_col

# 1. apply to models

## 1.1 model 1 "political satisfaction"

In [None]:
columns

#### a) selection of features by logic

In [None]:
ds = reload()
## selection of features by logic
try1 = ['PB140: YEAR OF BIRTH',
 'PB150: SEX',
 'PB190: MARITAL STATUS',
 'PE040: HIGHEST ISCED LEVEL ATTAINED',
 'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
 'PH010: GENERAL HEALTH',
 'PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME',
 'PY090G++: SOCIAL BENEFITS',
 'PD080: Internet connection for personal use at home','PW010: OVERALL LIFE SATISFACTION']

In [None]:
regmodel (try1,'PW130: TRUST IN THE POLITICAL SYSTEM','yes' , 'yes', 'Normalizer')

In [None]:
tryclass_bin(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',3, 'yes' , 'yes', 'Normalizer')

In [None]:
#tryclass_multi(xes,yli,nrbinx, nrbiny)
tryclass_multi(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',4,4, 'yes' , 'yes', 'Normalizer')

In [None]:
###  try2 only with prior features based on RFE

In [None]:
ds=reload()
try2 =generateRFE (ds[try1], ds['PW130: TRUST IN THE POLITICAL SYSTEM'], 2)

In [None]:
for x in range(2,6):
    tryclass_multi(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',x,x, 'yes' , 'yes', 'Normalizer')


other way of binning:


# Hand-made binning of the target: neighbouring answer codes are merged into
# one class, the code -1 is left untouched.
# NOTE(review): `yli` is not assigned at module level anywhere in this file;
# it is set here to the target used throughout section 1.1 - confirm.
yli = 'PW130: TRUST IN THE POLITICAL SYSTEM'
ds = reload()


def _merge_levels(value):
    """Map one answer code to its merged class (<=2 -> 2, 3/4 -> 4, 6/7 -> 6)."""
    if value <= 2 and value != -1:
        return 2
    # The original test `(x == 3) & (x == 4)` could never be true, so 3 and 4
    # were never merged; "3 or 4" is what the parallel 6/7 rule suggests.
    if value in (3, 4):
        return 4
    if value in (6, 7):
        return 6
    return value


ds[yli] = ds[yli].apply(_merge_levels)
ds[yli].unique()
ds[yli].hist()

#### b) based on RFE- all

#### c) based on RFE- all

In [None]:
## based on RFE all columns
try3 = ['PE010: CURRENT EDUCATION ACTIVITY',
 'PL150: MANAGERIAL POSITION',
 'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
 'PD070: Spend a small amount of money each week on yourself',
 'PW070: FEELING CALM AND PEACEFUL',
 'PW220: PHYSICAL SECURITY']

#try3
xes = generateRFE(ds[columns], ds[yli], 8)

In [None]:
## multi clas
class_mul(xes, yli, 'yes','no','Normalizer')

RFE with Dummy-Features

In [None]:
## do rfe with dummy-features
ds=reload()
categorical= ['PB150: SEX', 'PB190: MARITAL STATUS',
       'PB200: CONSENSUAL UNION', 'PB220A: CITIZENSHIP',
       'PE010: CURRENT EDUCATION ACTIVITY',
       'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
       'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK',
       'PL051: OCCUPATION (ISCO-08 (COM))', 'PL150: MANAGERIAL POSITION',
        'PH030: LIMITATION IN ACTIVITIES BECAUSE OF HEALTH PROBLEMS',
       'PH040: UNMET NEED FOR MEDICAL EXAMINATION OR TREATMENT',
       'PH060: UNMET NEED FOR DENTAL EXAMINATION OR TREATMENT',
       'PD020: Replace worn-out clothes by some new (not second-hand) ones',
       'PD030: Two pairs of properly fitting shoes',
       'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
       'PD060: Regularly participate in a leisure activity',
       'PD070: Spend a small amount of money each week on yourself',
       'PD080: Internet connection for personal use at home',
      ]
#columns.remove(yli)
create_dummy=set(columns).intersection(categorical)
create_dummy

In [None]:
data_dummy = pd.get_dummies(ds[columns], columns=create_dummy, drop_first=True)
data_dummy.columns
data_dummy
generateRFE(data_dummy, ds[yli], 10)

In [None]:
try4 = generateRFE(data_dummy, ds[yli], 10)

In [None]:
X = data_dummy[try4]
y= ds[yli]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#### b) DecisionTree Classify

dtc = DecisionTreeClassifier()#class_weight='balanced')#max_features=10,
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

acc = dtc.score(X_test, y_test)*100
print(f"Decision Tree Test Accuracy {round(acc, 2)}%")
 

#### c) Support Vector Maschine
svm = SVC(decision_function_shape='ovo')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
acc = svm.score(X_test,y_test)*100
print(f"SVM Algorithm Test Accuracy {round(acc, 2)}%")

#### <span style="color:red"> Conclusion: no model achieves an acceptable accuracy</span>

## 1.2 model 1 income

In [None]:
columns

#### a) selection of features by logic

In [None]:
ds = reload()
## selection of features by logic
try1 = ['PB140: YEAR OF BIRTH',
 'PB150: SEX',
 'PB190: MARITAL STATUS',
 'PE040: HIGHEST ISCED LEVEL ATTAINED',
 'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
 'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK',
 'PL051: OCCUPATION (ISCO-08 (COM))',
 'PL150: MANAGERIAL POSITION',
 'PH010: GENERAL HEALTH',
 'PW010: OVERALL LIFE SATISFACTION',
 'PW080: FEELING DOWNHEARTED OR DEPRESSED',
 'PW090: BEING HAPPY',
 'PW180: HELP FROM OTHERS',
 'PW190: TRUST IN OTHERS']

ds=ds[ds['PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME']!=0]

In [None]:
plt.scatter(ds['PB140: YEAR OF BIRTH'],ds['PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME'])
ds['PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME'].describe()

In [None]:
# The features are passed as a list: regmodel/generateTest treat X_interested
# as a collection of column names, and a bare string would be taken apart
# into single characters when the categorical columns are looked up.
regmodel(['PB140: YEAR OF BIRTH'],'PY010G_PY020G_PY021G_PY050G: EMPLOYEE INCOME','yes' , 'yes', 'Normalizer')

In [None]:
tryclass_bin(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',3, 'yes' , 'yes', 'Normalizer')

In [None]:
#tryclass_multi(xes,yli,nrbinx, nrbiny)
tryclass_multi(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',4,4, 'yes' , 'yes', 'Normalizer')

In [None]:
###  try2 only with prior features based on RFE

In [None]:
ds=reload()
try2 =generateRFE (ds[try1], ds['PW130: TRUST IN THE POLITICAL SYSTEM'], 2)

In [None]:
for x in range(2,6):
    tryclass_multi(try1,'PW130: TRUST IN THE POLITICAL SYSTEM',x,x, 'yes' , 'yes', 'Normalizer')


other way of binning:


# Hand-made binning of the target: neighbouring answer codes are merged into
# one class, the code -1 is left untouched.
# NOTE(review): `yli` is not assigned at module level anywhere in this file;
# it is set here to the target that the surrounding calls of this section
# still use - confirm, since the section heading refers to income.
yli = 'PW130: TRUST IN THE POLITICAL SYSTEM'
ds = reload()


def _merge_levels(value):
    """Map one answer code to its merged class (<=2 -> 2, 3/4 -> 4, 6/7 -> 6)."""
    if value <= 2 and value != -1:
        return 2
    # The original test `(x == 3) & (x == 4)` could never be true, so 3 and 4
    # were never merged; "3 or 4" is what the parallel 6/7 rule suggests.
    if value in (3, 4):
        return 4
    if value in (6, 7):
        return 6
    return value


ds[yli] = ds[yli].apply(_merge_levels)
ds[yli].unique()
ds[yli].hist()

#### b) based on RFE- all

#### c) based on RFE- all

In [None]:
## based on RFE all columns
try3 = ['PE010: CURRENT EDUCATION ACTIVITY',
 'PL150: MANAGERIAL POSITION',
 'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
 'PD070: Spend a small amount of money each week on yourself',
 'PW070: FEELING CALM AND PEACEFUL',
 'PW220: PHYSICAL SECURITY']

#try3
xes = generateRFE(ds[columns], ds[yli], 8)

In [None]:
## multi clas
class_mul(xes, yli, 'yes','no','Normalizer')

RFE with Dummy-Features

In [None]:
## do rfe with dummy-features
ds=reload()
categorical= ['PB150: SEX', 'PB190: MARITAL STATUS',
       'PB200: CONSENSUAL UNION', 'PB220A: CITIZENSHIP',
       'PE010: CURRENT EDUCATION ACTIVITY',
       'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
       'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK',
       'PL051: OCCUPATION (ISCO-08 (COM))', 'PL150: MANAGERIAL POSITION',
        'PH030: LIMITATION IN ACTIVITIES BECAUSE OF HEALTH PROBLEMS',
       'PH040: UNMET NEED FOR MEDICAL EXAMINATION OR TREATMENT',
       'PH060: UNMET NEED FOR DENTAL EXAMINATION OR TREATMENT',
       'PD020: Replace worn-out clothes by some new (not second-hand) ones',
       'PD030: Two pairs of properly fitting shoes',
       'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
       'PD060: Regularly participate in a leisure activity',
       'PD070: Spend a small amount of money each week on yourself',
       'PD080: Internet connection for personal use at home',
      ]
#columns.remove(yli)
create_dummy=set(columns).intersection(categorical)
create_dummy

In [None]:
data_dummy = pd.get_dummies(ds[columns], columns=create_dummy, drop_first=True)
data_dummy.columns
data_dummy
generateRFE(data_dummy, ds[yli], 10)

In [None]:
try4 = generateRFE(data_dummy, ds[yli], 10)

In [None]:
X = data_dummy[try4]
y= ds[yli]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#### b) DecisionTree Classify

dtc = DecisionTreeClassifier()#class_weight='balanced')#max_features=10,
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

acc = dtc.score(X_test, y_test)*100
print(f"Decision Tree Test Accuracy {round(acc, 2)}%")
 

#### c) Support Vector Maschine
svm = SVC(decision_function_shape='ovo')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
acc = svm.score(X_test,y_test)*100
print(f"SVM Algorithm Test Accuracy {round(acc, 2)}%")

#### <span style="color:red"> Conclusion: no model achieves an acceptable accuracy</span>

# Stepwise as Backup
# 1 Data Selection and Preparation

#### 1.1. define features and Y

In [None]:
outcome = ds['PW010: OVERALL LIFE SATISFACTION']
features = ds[background]
outcomelist='PW010: OVERALL LIFE SATISFACTION'
featureslist=background

In [None]:
data =pd.merge(features, outcome, left_index=True, right_index=True)

In [None]:
data.head()

#### 1.2 make left-skewed X categorical

In [None]:
shouldbecategorical= main_list = list(set(data.columns)-set(categorical)-set([outcomelist]))

#### 1.3 create dummies

In [None]:
categorical= ['PB150: SEX', 'PB190: MARITAL STATUS',
       'PB200: CONSENSUAL UNION', 'PB220A: CITIZENSHIP',
       'PE010: CURRENT EDUCATION ACTIVITY',
       'PL031: SELF-DEFINED CURRENT ECONOMIC STATUS',
       'PL035: WORKED AT LEAST 1 HOUR DURING THE PREVIOUS WEEK',
       'PL051: OCCUPATION (ISCO-08 (COM))', 'PL150: MANAGERIAL POSITION',
        'PH030: LIMITATION IN ACTIVITIES BECAUSE OF HEALTH PROBLEMS',
       'PH040: UNMET NEED FOR MEDICAL EXAMINATION OR TREATMENT',
       'PH060: UNMET NEED FOR DENTAL EXAMINATION OR TREATMENT',
       'PD020: Replace worn-out clothes by some new (not second-hand) ones',
       'PD030: Two pairs of properly fitting shoes',
       'PD050: Get-together with friends/family (relatives) for a drink/meal at least once a month',
       'PD060: Regularly participate in a leisure activity',
       'PD070: Spend a small amount of money each week on yourself',
       'PD080: Internet connection for personal use at home',
      ]

create_dummy=set(featureslist).intersection(categorical)

data_dummy = pd.get_dummies(data, columns=create_dummy, drop_first=True)
data_dummy.columns

In [None]:
# use dummy:
data = data_dummy

# not use dummy:
#data = data =pd.merge(features, outcome, left_index=True, right_index=True)

# 2. Regression

#### <font color=red>here decide to keep or drop nan in the outcome: </font>

In [None]:
## drop nan
#data = data[data[outcomelist]>=0]

#reset 
#data = data =pd.merge(features, outcome, left_index=True, right_index=True)

In [None]:
XCol=list(data.columns)
XCol.remove(outcomelist)
YCol=outcomelist

In [None]:
X = data[XCol]
y= data[YCol]

•	Imbalance checking (undersampling, oversampling; ML library for imbalanced data)

•	Split --> X_tr, y_tr, / X_ts,  y_ts

•	Try few models (with different algorithms) (<-- pipeline)

    –	Initialization

    –	Fit / predict

    –	Evaluation (overfitting / underfitting)

•	Choose best option based on requirements

•	Optimize the model

    –	Cross -validation

    –	Try few models (with same way to approach problem)

        o	Init

        o	Fit.(predict)

        o	Evaluation

    –	Hyperparameter tuning


### 2.1 Test for imbalancing in Y

In [None]:
YCol

In [None]:
y.hist()

## It is very imbalanced --> bin the values below 5 and change "don't know" to another value

In [None]:
#y.loc[(y['PW130: TRUST IN THE POLITICAL SYSTEM']==-1), 'PW130: TRUST IN THE POLITICAL SYSTEM']= X

#### <font color=red>here decide how to bin y: </font>

In [None]:
y_binned=y.apply(lambda x: 4.5 if (x <=5) & (x!=-1) else x)
y_binned =y_binned.apply(lambda x: 6.5 if ((x ==6) | (x==7)) else x)
y_binned.unique()

In [None]:
y_binned.hist()


In [None]:
### just to check correlation
# Join features and target on the index so that the target appears in the
# correlation matrix as well.
dset = pd.merge(X, y, left_index=True, right_index=True)


plt.figure(figsize =(10,5))
corr = dset.corr()
# `np.bool` has been removed from NumPy; the builtin `bool` is the drop-in
# replacement. The mask hides the upper triangle (diagonal included) so that
# every pair of columns is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, cmap='coolwarm', annot = False, linewidth=0.5, mask=mask)

## 2.2. Scale

#### <font color=red>here decide to scale x or not and if yes which one: </font>

In [None]:
### scaler
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std.shape
X.columns

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_minmax = scaler.fit_transform(X)
X_minmax.shape


## 2.3 Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_minmax, y, test_size=0.2)

## 2.4 run different models

#### a) Prepare Model 

In [None]:
## prepare interaction term
# NOTE(review): range(2) yields d = 0 and d = 1. X_inter is overwritten on
# every pass, so only the d = 1 result survives the loop, and no later line
# in this file reads X_inter - confirm which degrees were intended.
for d in range(2):
    interaction = PolynomialFeatures(degree = d, include_bias = False, interaction_only = True)
    X_inter = interaction.fit_transform(X_train)

#### b)  Linear Regression Model

In [None]:

lm = LinearRegression()
model = lm.fit(X_train,y_train)

print("Intercept: ", lm.intercept_)
print("Coef: ", lm.coef_)

y_pred  = lm.predict(X_test)


In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test, y_pred)

#### c)  Taylor (pipeline)

#for k in range(1,8):
poly_model = make_pipeline (StandardScaler(), PolynomialFeatures(degree=3, include_bias = False, interaction_only = True), LinearRegression())

model = poly_model.fit(X_train, y_train)

    
print(poly_model.score(X_test, y_test))

y_pred  = poly_model.predict(X_test)


In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test, y_pred)

#### c)  Regression Tree

In [None]:
regr = DecisionTreeRegressor(random_state = 29)

model = regr.fit(X_train, y_train)

y_pred  = regr.predict(X_test)
regr.score(X_test, y_test)

In [None]:
#r = export_text(regr, feature_names=list(X.columns))
#print(r)

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test, y_pred)


In the rare cases you get a negative r squared value, you should probably rethink your regression analysis, especially if you are forcing an intercept.

#### d)  KNeighborsRegressor

In [None]:
knnr = KNeighborsRegressor(n_neighbors = 3)

model = knnr.fit(X_train, y_train)  #fit the model
y_pred = knnr.predict(X_test)


In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test, y_pred)

#### <font color=red>Huge mean squared error and negative R² for both binned and unbinned y.
     Also failed after dropping NaN in the outcome, although the results were much better.
    Also failed with feature elimination.

</font>

# 3. optimize recursive feature elimination (RFE)

In [None]:
## recursive feature elimination.
# initialize an RFE model using the `auto_model` linear regression model. Set `n_features_to_select=3`

In [None]:
# generateTest takes five arguments (X, y, dummy, drop_na_in_y, scalertouse);
# the call used to pass six and could not run.
# NOTE(review): the surplus 'yes' was dropped and scaling stays off ('no'),
# which keeps X a DataFrame as the later `.columns` access needs - confirm.
generateTest(columns[:-20],outcomelist, 'yes', 'yes', 'no')[0]

Fit the model and print the ranking

In [None]:
# One split only, so that the feature rows and the target rows belong
# together (two separate generateTest() calls give two different random
# splits). generateTest takes five arguments; the surplus 'yes' was dropped.
# NOTE(review): generateRFE requires a `limit`; 10 mirrors the rank threshold
# used in the ranking cell further down - confirm.
X_rfe, _, y_rfe, _ = generateTest(columns[:-20], outcomelist, 'yes', 'yes', 'no')
generateRFE(X_rfe, y_rfe, 10)

In [None]:
# Fit an RFE selector (linear regression, 5 features kept, one feature
# removed per step) on the training part of a single split.
auto_model = LinearRegression()
# n_features_to_select must be given by keyword in current scikit-learn.
selector = RFE(auto_model, n_features_to_select=5, step=1)
# One generateTest() call: X and y taken from two separate calls would come
# from two different random splits. The call takes five arguments; the
# surplus 'yes' was dropped - NOTE(review): confirm.
X_sel, _, y_sel, _ = generateTest(columns[:-20], outcomelist, 'yes', 'yes', 'no')
model = selector.fit(X_sel, y_sel)

In [None]:
# Your code here:
selector.ranking_
#selector.support_

In [None]:
# Collect the names of all columns whose RFE rank is at most 10.
# generateTest takes five arguments; the surplus 'yes' was dropped and
# scaling stays off so that Xhere keeps its column names.
# NOTE(review): confirm the intended arguments.
Xhere = generateTest(columns[:-20],outcomelist, 'yes', 'yes', 'no')[0]
rfe_col = [col for col, rank in zip(Xhere.columns, selector.ranking_) if rank <= 10]
print(len(rfe_col))
rfe_col

In [None]:
X=data[columns]
X.columns

In [None]:
# Correlation heatmap of all selected columns plus the target.
# A copy is used so that adding the helper column does not write into X
# (X is itself a selection from `data`).
# NOTE(review): `ytogo` is not defined anywhere in this file - confirm which
# target series is meant.
test = X.copy()
test['YY'] = ytogo
plt.figure(figsize =(10,10))
corr = test.corr()
# `np.bool` has been removed from NumPy; the builtin `bool` is the drop-in
# replacement. The mask hides the upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, cmap='coolwarm', annot = False, linewidth=0.5, mask=mask)
## --> run 2.2 split and 2.3 models again

## 2.3 PCA

In [None]:
X = data[XCol]
X.shape

In [None]:
# Create a PCA that will retain 99% of variance
pca = PCA(n_components=0.99, whiten=True)
# ica = FastICA(n_components=0.99, whiten=True)

# Conduct PCA
features_pca = pca.fit_transform(X)

In [None]:
X= features_pca
X.shape
## --> run 2.2 split and 2.3 models again

## 2. Classification

### 2.1.1 prepare data

In [None]:
# change outcome variable to a 2 classification variable:
# drop -1
# based on ds_1
# NOTE(review): `ds_1` is not defined anywhere in this file - confirm where
# it comes from.
# .copy() makes `data` an independent frame, so the `Y_class` column that is
# added to it afterwards is not written into a slice of ds_1.
data = ds_1[ds_1['PW130: TRUST IN THE POLITICAL SYSTEM'] != -1].copy()
data.shape
# number of rows removed by the filter
ds_1.shape[0]-data.shape[0]

In [None]:
data['Y_class']= data['PW130: TRUST IN THE POLITICAL SYSTEM'].apply(lambda x: 1 if x>=8 else 0)

In [None]:
data['Y_class'].hist()
data['Y_class'].value_counts()
## --> balanced more or less

In [None]:
XCol=data.columns[:-2]
YCol=data.columns[-1]

print(len(XCol))
print(YCol)

In [None]:
X = data[XCol]
y= data[YCol]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape
X_test.shape


#### a) Logistic Regression

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)


y_pred=lr.predict(X_test)

In [None]:
acc = lr.score(X_test,y_test)
acc

In [None]:
conf=confusion_matrix(y_test, y_pred)

print( "accuracy: "  + str(round((conf[1,1]+conf[0,0])/ conf.sum()*100,2)))
print( "precision: " +str(round((conf[0,0])/ (conf[1,0]+conf[0,0])*100,2)))

conf

tn,fp,fn,tp = confusion_matrix(y_test, y_pred).flatten()
### it predicts all as 1...

#### b) DecisionTree Classify

In [None]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [None]:
acc = dtc.score(X_test, y_test)*100
print(f"Decision Tree Test Accuracy {round(acc, 2)}%")

In [None]:
conf=confusion_matrix(y_test, y_pred)

print( "accuracy: "  + str(round((conf[1,1]+conf[0,0])/ conf.sum()*100,2)))
print( "precision: " +str(round((conf[0,0])/ (conf[1,0]+conf[0,0])*100,2)))

conf

In [None]:
## overfitting!!!

#### c) Support Vector Machine

In [None]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [None]:
acc = svm.score(X_test,y_test)*100
print(f"SVM Algorithm Test Accuracy {round(acc, 2)}%")

In [None]:
conf=confusion_matrix(y_test, y_pred)

print( "accuracy: "  + str(round((conf[1,1]+conf[0,0])/ conf.sum()*100,2)))
print( "precision: " +str(round((conf[0,0])/ (conf[1,0]+conf[0,0])*100,2)))

#### d) K-Nearest Neighbour

In [None]:
knn = KNeighborsClassifier(n_neighbors = 2)  # n_neighbors means k
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
acc = knn.score(X_test, y_test)*100
acc

In [None]:
conf=confusion_matrix(y_test, y_pred)

print( "accuracy: "  + str(round((conf[1,1]+conf[0,0])/ conf.sum()*100,2)))
print( "precision: " +str(round((conf[0,0])/ (conf[1,0]+conf[0,0])*100,2)))
