In [None]:
## load required python packages
import pandas as pd
import numpy as np

! pip install pygam
from pygam import LinearGAM, LogisticGAM

import matplotlib.pyplot as plt

In [None]:
# Load the preprocessed survey export and drop its first column
# (presumably a saved row index -- TODO confirm against the CSV).
survey_csv = "survey_preprocessed_0127.csv"
df = pd.read_csv(survey_csv).iloc[:, 1:]
df.head()

In [None]:
# Candidate target columns: every column whose first underscore-separated
# token is exactly 'm'.
target_variables = [name for name in df.columns if name.split('_')[0] == 'm']
target_variables

In [None]:
# Model inputs: six fixed columns, followed by every column whose first
# underscore-separated token is 'cop' or 'env' (kept in df column order).
input_features = ['method','isPractical','majorForCourse','fstMajor','gender','enterDate']
input_features += [
    name for name in df.columns
    if name.split("_")[0] in ('cop', 'env')
]

input_features

In [None]:
# Recode the textual survey answers to integers, one answer scale at a time.
# Keys are spelled exactly as they are matched in df (the first two scales in
# lower case); the values are the integer codes written back into df.
agreement_codes = {
    'disagree': -1,
    'strongly agree': 1,
    'strongly disagree': -1,
    'agree': 1,
    'neutral': 0,
}
frequency_codes = {
    'not at all true': -1,
    'a little true': -1,
    'none': 0,
    'sometimes true': 1,
    'mostly true': 1,
    'very true': 2,
}
yes_no_codes = {'Yes': 1, 'No': 0}
gender_codes = {'Female': 1, 'Male': 0}

# Applied in the same order as before; df is modified in place.
for codes in (agreement_codes, frequency_codes, yes_no_codes, gender_codes):
    df.replace(list(codes.keys()), list(codes.values()), inplace=True)

In [None]:
# Preview df after the answer strings were recoded to integers in the previous cell.
df.head()

In [None]:
import copy
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split

# Experiment configuration shared by the cells below.
split_ratio = 0.2        # fraction of rows held out as the test split
seed = 1                 # random_state passed to train_test_split
target = 'm_sameMethod'  # column of df used as the label

# Separate numeric inputs from the ones that need one-hot encoding.
# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well so boolean columns keep being treated as numeric.
num_cols = df[input_features].select_dtypes(include=['number', 'bool']).columns

# Keep the categorical columns in input_features order. A set was used here
# before, which made the order of the dummy columns depend on per-process
# string hashing and therefore differ between kernel restarts.
categorical_cols = [col for col in input_features if col not in num_cols]

one_hot_encoded_data = pd.get_dummies(df[input_features], columns=categorical_cols)
X, y = one_hot_encoded_data, df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=seed)

In [None]:
# Preview the one-hot encoded training features.
X_train.head()

In [None]:
# Inspect the dtype of every column of the encoded training features.
X_train.dtypes

In [None]:
from imblearn.over_sampling import SMOTE

# Rebuild the encoded feature matrix for the full input feature list.
combi = input_features

# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well so boolean columns keep being treated as numeric.
num_cols = df[combi].select_dtypes(include=['number', 'bool']).columns

# Ordered list instead of a set: the dummy-column order no longer depends on
# per-process string hashing, so X has the same layout on every run.
categorical_cols = [col for col in combi if col not in num_cols]

one_hot_encoded_data = pd.get_dummies(df[combi], columns=categorical_cols)

# Work on a copy of the label column so df itself is never touched.
X, y = one_hot_encoded_data, df[target].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=seed)

# Oversample the training split only. SMOTE draws random synthetic samples,
# so it gets the same seed as the split to make the resampled set repeatable.
X_resampled, y_resampled = SMOTE(random_state=seed).fit_resample(X_train, y_train)

In [None]:
# Preview the full one-hot encoded feature matrix (before the train/test split).
X.head()

In [None]:
# Class counts of the training labels after SMOTE resampling.
y_resampled.value_counts()#train y distribution

In [None]:
# One-vs-rest target for class 0: 1 where the resampled label is 0, else 0.
y0_resampled = y_resampled.eq(0) * 1

# Fit a 20-spline logistic GAM, then let gridsearch (default arguments)
# tune it on the same resampled training data.
gam0 = LogisticGAM(n_splines=20)
gam0 = gam0.fit(X_resampled, y0_resampled)
log_gam0 = gam0.gridsearch(X_resampled, y0_resampled)

In [None]:
# Accuracy of the class-0-vs-rest model on the resampled training data it was fitted on.
log_gam0.accuracy(X_resampled,y0_resampled)

In [None]:
# One-vs-rest target for class 1: 1 where the resampled label is 1, else 0.
y1_resampled = y_resampled.eq(1) * 1

# Fit a 10-spline logistic GAM, then let gridsearch (default arguments)
# tune it on the same resampled training data.
gam1 = LogisticGAM(n_splines=10)
gam1 = gam1.fit(X_resampled, y1_resampled)
log_gam1 = gam1.gridsearch(X_resampled, y1_resampled)

In [None]:
# Accuracy of the class-1-vs-rest model on the resampled training data it was fitted on.
log_gam1.accuracy(X_resampled,y1_resampled)

In [None]:
# One-vs-rest target for class -1: 1 where the resampled label is -1, else 0.
y_neg_resampled = y_resampled.eq(-1) * 1

# Fit a 20-spline logistic GAM, then let gridsearch (default arguments)
# tune it on the same resampled training data.
gam_neg = LogisticGAM(n_splines=20)
gam_neg = gam_neg.fit(X_resampled, y_neg_resampled)
log_gam_neg = gam_neg.gridsearch(X_resampled, y_neg_resampled)

In [None]:
# Accuracy of the class-(-1)-vs-rest model on the resampled training data it was fitted on.
log_gam_neg.accuracy(X_resampled,y_neg_resampled)

In [None]:
# Per-class probabilities on the resampled training data, stacked in the
# order (-1, 0, 1) so the list position identifies the class.
proba = [
    ovr_model.predict_proba(X_resampled)
    for ovr_model in (log_gam_neg, log_gam0, log_gam1)
]

In [None]:
# The winning list position (0, 1, 2) corresponds to the class labels
# (-1, 0, 1), so shifting the argmax down by one yields the predicted label.
preds = np.argmax(np.array(proba), axis=0) - 1

In [None]:
from sklearn.metrics import accuracy_score,f1_score
# Accuracy of the combined one-vs-rest prediction on the resampled training data.
accuracy_score(y_resampled,preds)#Train acc

In [None]:
# Per-class probabilities on the held-out test split, stacked in the
# order (-1, 0, 1) so the list position identifies the class.
proba = [
    ovr_model.predict_proba(X_test)
    for ovr_model in (log_gam_neg, log_gam0, log_gam1)
]

In [None]:
# The winning list position (0, 1, 2) corresponds to the class labels
# (-1, 0, 1), so shifting the argmax down by one yields the predicted label.
preds = np.argmax(np.array(proba), axis=0) - 1

In [None]:
from sklearn.metrics import accuracy_score,f1_score
# Accuracy of the combined one-vs-rest prediction on the held-out test split.
accuracy_score(y_test,preds)#Test acc

In [None]:
from itertools import combinations

# Class labels of `target`, in the order the one-vs-rest probabilities are stacked.
CLASS_LABELS = np.array([-1, 0, 1])


def _one_hot_features(frame, features):
    """One-hot encode the non-numeric columns of ``frame[features]``.

    The categorical columns are passed to ``pd.get_dummies`` in the order they
    appear in ``features`` so the resulting column layout is the same on every
    run (a set was used before, whose iteration order varies per process).
    """
    subset = frame[features]
    numeric = set(subset.select_dtypes(include=['number', 'bool']).columns)
    categorical = [col for col in features if col not in numeric]
    return pd.get_dummies(subset, columns=categorical)


def _fit_one_vs_rest(X_fit, y_fit, n_splines):
    """Fit and grid-search one LogisticGAM per class label (one-vs-rest).

    Returns the models as a list in ``CLASS_LABELS`` order: (-1, 0, 1).
    """
    models = []
    for label in CLASS_LABELS:
        y_binary = (y_fit == label) * 1
        gam = LogisticGAM(n_splines=n_splines).fit(X_fit, y_binary)
        models.append(gam.gridsearch(X_fit, y_binary))
    return models


def _predict_labels(models, X_eval):
    """Return, per row, the label of the model with the highest probability."""
    proba = np.array([model.predict_proba(X_eval) for model in models])
    return CLASS_LABELS[np.argmax(proba, axis=0)]


best_f1 = 0.
best_acc = 0.

# Pre-declare the winners so the result cells below do not raise NameError
# when no configuration could be fitted.
best_gam0 = best_gam1 = best_gam_neg = None
best_features = best_n_splines = None
best_f1_gam0 = best_f1_gam1 = best_f1_gam_neg = None
best_f1_features = best_f1_n_splines = None

# NOTE(review): the winner is chosen by its score on the test split, so
# best_acc / best_f1 are optimistic estimates; a separate validation split
# would be needed for an unbiased figure.
for n_sample in range(4,8):
    print("best test acc:",best_acc,"best f1 score:",best_f1)
    for combi in combinations(input_features,n_sample):
        combi = list(combi)
        X, y = _one_hot_features(df, combi), df[target].copy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=seed)
        # Seeded so the synthetic samples, and hence the search, are repeatable.
        X_resampled, y_resampled = SMOTE(random_state=seed).fit_resample(X_train, y_train)

        for n in range(3,20):
            try:
                print("n_splines:",n)

                log_gam_neg, log_gam0, log_gam1 = _fit_one_vs_rest(X_resampled, y_resampled, n)
                ovr_models = [log_gam_neg, log_gam0, log_gam1]

                # Accuracy on the resampled training data.
                train_acc = accuracy_score(y_resampled, _predict_labels(ovr_models, X_resampled))

                # Accuracy and weighted F1 on the held-out test split.
                preds = _predict_labels(ovr_models, X_test)
                test_acc = accuracy_score(y_test,preds)
                f1_test = f1_score(y_test,preds,average = 'weighted')
                print("train acc:",train_acc,"test acc:",test_acc,"f1 score:",f1_test)

                # Track the best configuration by test accuracy ...
                if test_acc > best_acc:
                    best_acc = test_acc
                    best_gam0 = log_gam0
                    best_gam1 = log_gam1
                    best_gam_neg = log_gam_neg
                    best_features = combi
                    best_n_splines = n

                # ... and, independently, the best one by weighted F1.
                if f1_test > best_f1:
                    best_f1 = f1_test
                    best_f1_gam0 = log_gam0
                    best_f1_gam1 = log_gam1
                    best_f1_gam_neg = log_gam_neg
                    best_f1_features = combi
                    best_f1_n_splines = n

            except Exception as e:
                # Best effort: report the failing configuration and move on.
                print("except:",e)
                continue

In [None]:
# Feature subset of the configuration with the highest test accuracy found by the search.
best_features

In [None]:
# Name of the label column the search was run for.
target

In [None]:
# n_splines of the configuration with the highest test accuracy.
best_n_splines

In [None]:
# Highest test accuracy reached during the search.
best_acc

In [None]:
# Highest weighted F1 score on the test split reached during the search.
best_f1