In [12]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from collections import defaultdict

In [13]:
# Load the pre-processed news data and discard every column that is not used
# as a model feature (identifiers, topic, raw and processed popularity counts).
unused_columns = [
    'IDLink', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn',
    'Facebook_pro', 'GooglePlus_pro', 'LinkedIn_pro',
]
df = pd.read_csv('../data/News_pro.csv').drop(columns=unused_columns)

# Remove all rows whose BestPlat label is 0 or 5. The label is temporarily used
# as the index so the rows can be dropped by label, then restored as a column.
df = df.set_index('BestPlat').drop([0, 5], axis=0).reset_index()

# Feature matrix (everything except the target) and target vector as arrays.
X = df.drop(columns=['BestPlat']).values
y = df['BestPlat'].values

In [16]:
def ML_pipeline_kfold_LR(X,y,random_state,n_folds=5,l='l2'):
    """Tune and evaluate a multinomial logistic regression with stratified k-fold CV.

    The data is first split 80/20 (stratified on ``y``) into an "other" part and
    a held-out test part. The "other" part is split into ``n_folds`` stratified
    folds. Within each fold, one model is fitted per candidate regularisation
    strength ``alpha`` (``C = 1/alpha``), the model with the highest validation
    accuracy is selected, and that model is scored on the held-out test set.

    Parameters
    ----------
    X, y : array-like
        Features and labels. They must support indexing with integer index
        arrays (e.g. NumPy arrays), since folds are selected as ``X[idx]``.
    random_state : int
        Seed used for the train/test split, the fold shuffling and the
        ``saga`` solver, so that repeated calls give the same result.
    n_folds : int, default 5
        Number of stratified CV folds.
    l : str, default 'l2'
        Penalty passed to ``LogisticRegression`` (e.g. ``'l1'`` or ``'l2'``).

    Returns
    -------
    CV_scores : list
        Best validation accuracy of each fold (length ``n_folds``).
    test_scores : list
        Test accuracy of each fold's selected model (length ``n_folds``).

    Notes
    -----
    The printed "best alpha" is the one selected in the *last* fold only;
    earlier folds may have selected a different value.
    """
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Candidate regularisation strengths. The grid is the same for every fold,
    # so it is built once here rather than inside the loop.
    alpha = np.logspace(-5,5,10)

    CV_scores = []
    test_scores = []

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # Fit one model per alpha and record its validation accuracy.
        CV_score = []
        regs = []
        for a in alpha:
            # saga is a stochastic solver: seed it so runs are reproducible.
            reg = LogisticRegression(penalty=l, C=1/a, solver="saga", max_iter=10000,
                                     multi_class="multinomial", random_state=random_state)
            reg.fit(X_train, y_train)
            CV_score.append(accuracy_score(y_CV, reg.predict(X_CV)))
            regs.append(reg)

        # Index of the best alpha in this fold (first one wins on ties).
        best = np.argmax(CV_score)
        best_alpha = alpha[best]
        CV_scores.append(np.max(CV_score))
        # Calculate the test score using the best model of this fold.
        test_scores.append(accuracy_score(y_test, regs[best].predict(X_test)))

    # NOTE: reports the alpha chosen in the last fold only.
    print("best alpha is ", best_alpha)
    return CV_scores, test_scores

# Run the pipeline for each penalty type over three random seeds, then report
# the elapsed wall-clock time and the mean/std over all per-fold test accuracies.
for l in ['l1', 'l2']:
    print("n_folds=5,alpha = np.logspace(-5,5,10), i in range(3)", l)
    s = time.time()
    test_scores = []
    for i in range(3):
        # Seeds are 0, 610 and 1220.
        grid, test_score = ML_pipeline_kfold_LR(X, y, random_state=i * 610, n_folds=5, l=l)
        print(l)
        test_scores.append(test_score)
    e = time.time()
    t = e - s
    print('time:', t)
    # test_scores is a (seeds x folds) nested list; statistics pool all entries.
    mean_acc = np.around(np.mean(test_scores), 6)
    std_acc = np.around(np.std(test_scores), 6)
    print('test Score:', mean_acc, "+/-", std_acc)

n_folds=5,alpha = np.logspace(-5,5,10), i in range(3) l1
best alpha is  3.593813663804626
l1
best alpha is  0.2782559402207126
l1
best alpha is  3.593813663804626
l1
time: 688.4462966918945
test Score: 0.656474 +/- 0.000838
n_folds=5,alpha = np.logspace(-5,5,10), i in range(3) l2
best alpha is  0.2782559402207126
l2
best alpha is  0.021544346900318846
l2
best alpha is  0.2782559402207126
l2
time: 688.17427277565
test Score: 0.656382 +/- 0.000763
