In [18]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from collections import defaultdict

In [19]:
# Columns that are removed before modelling; everything left except 'BestPlat'
# becomes a feature.
unused_columns = [
    'IDLink', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn',
    'Facebook_pro', 'GooglePlus_pro', 'LinkedIn_pro',
]
df = pd.read_csv('../data/News_pro.csv').drop(columns=unused_columns)

# Discard the rows whose 'BestPlat' value is 0 or 5: index by the label so the
# rows can be dropped by label value, then restore 'BestPlat' as a column.
df = df.set_index('BestPlat').drop([0, 5], axis=0).reset_index()

# Feature matrix and target vector as plain NumPy arrays.
X = df.drop(columns=['BestPlat']).values
y = df['BestPlat'].values

In [11]:
def ML_pipeline_kfold_RF(X,y,random_state,n_folds=5,max_depths=None,min_samples_splits=None):
    """Grid-search a random forest inside stratified k-fold CV and score each fold's winner on a held-out test set.

    20% of the data is split off (stratified on ``y``) as a test set. The remaining
    80% is divided into ``n_folds`` stratified folds. Within every fold each
    (max_depth, min_samples_split) combination is fitted on the training part and
    scored by accuracy on the validation part; the best combination's model is then
    scored on the test set.

    Parameters
    ----------
    X : array indexable by integer index arrays (e.g. a NumPy array)
        Feature matrix.
    y : array indexable by integer index arrays
        Class labels, also used for stratification.
    random_state : int
        Seed for both the train/test split and the fold shuffling.
    n_folds : int, default 5
        Number of stratified folds.
    max_depths : iterable of int, optional
        Candidate ``max_depth`` values. Defaults to ``range(2, 15, 5)``.
    min_samples_splits : iterable, optional
        Candidate ``min_samples_split`` values. Defaults to
        ``[0.05, 0.1, 0.15, 0.2, 0.25]``.

    Returns
    -------
    CV_scores : list of float
        Best validation accuracy found in each fold.
    test_scores : list of float
        Test accuracy of each fold's best model.

    Raises
    ------
    ValueError
        If either hyper-parameter grid is empty.

    Notes
    -----
    The forests themselves are always built with ``n_estimators=100`` and
    ``random_state=4``. The two "best ..." lines printed at the end describe the
    LAST fold only.
    """
    if max_depths is None:
        max_depths = range(2,15,5)
    if min_samples_splits is None:
        min_samples_splits = [0.05, 0.1, 0.15, 0.2, 0.25]
    max_depths = list(max_depths)
    min_samples_splits = list(min_samples_splits)
    if not max_depths or not min_samples_splits:
        raise ValueError("max_depths and min_samples_splits must each contain at least one value")

    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state, stratify = y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True,random_state=random_state)

    CV_scores = []
    test_scores = []
    best_max_depths = None
    best_min_samples_splits = None

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # tune the random forest hyper-parameters (max_depth, min_samples_split)
        # in this fold, keeping only the best model seen so far
        best_score = None
        best_model = None
        for depth in max_depths:
            for num_samp in min_samples_splits:
                clf = RandomForestClassifier(n_estimators=100, max_depth=depth, min_samples_split=num_samp, random_state=4)
                clf.fit(X_train, y_train)
                score = accuracy_score(y_CV, clf.predict(X_CV))
                # strict '>' keeps the first combination on ties
                if best_score is None or score > best_score:
                    best_score, best_model = score, clf
                    best_max_depths, best_min_samples_splits = depth, num_samp

        # record the best validation accuracy as a number (np.max applied to a
        # dict of scores does not reduce over the dict's values)
        CV_scores.append(best_score)
        # calculate test score using the best model of this fold
        test_scores.append(accuracy_score(y_test, best_model.predict(X_test)))

    print("best max_depths is ", best_max_depths)
    print("best min sample splits is ", best_min_samples_splits)
    return CV_scores, test_scores

# Time the pipeline over the chosen seeds (a single run here) and summarise
# the test accuracies collected across runs and folds.
s = time.time()
test_scores = []
for i in range(1):
    seed = i * 610
    # `grid` receives the per-fold CV scores; only the test scores are aggregated
    grid, test_score = ML_pipeline_kfold_RF(X, y, random_state=seed, n_folds=5)
    test_scores.append(test_score)
e = time.time()
t = e - s

mean_test_score = np.around(np.mean(test_scores), 2)
std_test_score = np.around(np.std(test_scores), 2)
print('test Score:', mean_test_score, "+/-", std_test_score)
print('time:', t)

best max_depths is  12
best min sample splits is  0.05
test Score: 0.67 +/- 0.0
time: 240.16114354133606


In [20]:
def ML_pipeline_kfold_RF(X,y,random_state,n_folds=5,max_depths=None,min_samples_splits=None):
    """Grid-search a random forest inside stratified k-fold CV and score each fold's winner on a held-out test set.

    NOTE: this cell redefines a function of the same name from an earlier cell;
    the default ``max_depth`` grid here is finer (step 3 instead of 5).

    20% of the data is split off (stratified on ``y``) as a test set. The remaining
    80% is divided into ``n_folds`` stratified folds. Within every fold each
    (max_depth, min_samples_split) combination is fitted on the training part and
    scored by accuracy on the validation part; the best combination's model is then
    scored on the test set.

    Parameters
    ----------
    X : array indexable by integer index arrays (e.g. a NumPy array)
        Feature matrix.
    y : array indexable by integer index arrays
        Class labels, also used for stratification.
    random_state : int
        Seed for both the train/test split and the fold shuffling.
    n_folds : int, default 5
        Number of stratified folds.
    max_depths : iterable of int, optional
        Candidate ``max_depth`` values. Defaults to ``range(2, 15, 3)``.
    min_samples_splits : iterable, optional
        Candidate ``min_samples_split`` values. Defaults to
        ``[0.05, 0.1, 0.15, 0.2, 0.25]``.

    Returns
    -------
    CV_scores : list of float
        Best validation accuracy found in each fold.
    test_scores : list of float
        Test accuracy of each fold's best model.

    Raises
    ------
    ValueError
        If either hyper-parameter grid is empty.

    Notes
    -----
    The forests themselves are always built with ``n_estimators=100`` and
    ``random_state=4``. The two "best ..." lines printed at the end describe the
    LAST fold only.
    """
    if max_depths is None:
        max_depths = range(2,15,3)
    if min_samples_splits is None:
        min_samples_splits = [0.05, 0.1, 0.15, 0.2, 0.25]
    max_depths = list(max_depths)
    min_samples_splits = list(min_samples_splits)
    if not max_depths or not min_samples_splits:
        raise ValueError("max_depths and min_samples_splits must each contain at least one value")

    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state, stratify = y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True,random_state=random_state)

    CV_scores = []
    test_scores = []
    best_max_depths = None
    best_min_samples_splits = None

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # tune the random forest hyper-parameters (max_depth, min_samples_split)
        # in this fold, keeping only the best model seen so far
        best_score = None
        best_model = None
        for depth in max_depths:
            for num_samp in min_samples_splits:
                clf = RandomForestClassifier(n_estimators=100, max_depth=depth, min_samples_split=num_samp, random_state=4)
                clf.fit(X_train, y_train)
                score = accuracy_score(y_CV, clf.predict(X_CV))
                # strict '>' keeps the first combination on ties
                if best_score is None or score > best_score:
                    best_score, best_model = score, clf
                    best_max_depths, best_min_samples_splits = depth, num_samp

        # record the best validation accuracy as a number (np.max applied to a
        # dict of scores does not reduce over the dict's values)
        CV_scores.append(best_score)
        # calculate test score using the best model of this fold
        test_scores.append(accuracy_score(y_test, best_model.predict(X_test)))

    print("best max_depths is ", best_max_depths)
    print("best min sample splits is ", best_min_samples_splits)
    return CV_scores, test_scores

# Time the pipeline over three seeds (0, 610, 1220) and summarise the test
# accuracies collected across all runs and folds.
s = time.time()
test_scores = []
for i in range(3):
    seed = i * 610
    # `grid` receives the per-fold CV scores; only the test scores are aggregated
    grid, test_score = ML_pipeline_kfold_RF(X, y, random_state=seed, n_folds=5)
    test_scores.append(test_score)
e = time.time()
t = e - s

mean_test_score = np.around(np.mean(test_scores), 6)
std_test_score = np.around(np.std(test_scores), 6)
print('test Score:', mean_test_score, "+/-", std_test_score)
print('time:', t)

best max_depths is  5
best min sample splits is  0.05
best max_depths is  5
best min sample splits is  0.05
best max_depths is  8
best min sample splits is  0.05
test Score: 0.670651 +/- 0.000601
time: 1116.4479389190674


In [10]:
# max_depths = 5
# min_samples_splits = 5
# i = 3