In [49]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [66]:
df = pd.read_csv("../data/record_df.csv")

In [67]:
drop = ["Unnamed: 0","IDLink","Source","PublishDate"]
dfn = df.drop(drop,1)

logmarker = ["sum-tpop-", "sum-tavg-","POPU","sum-popu","last-popu"]
for i in dfn.columns:
    for j in logmarker:
        if j in i:
            dfn[i] = np.log(2+dfn[i].values)
            
dfn.shape

(121049, 37)

In [73]:
dfn = dfn[dfn["POPU"]>5]
print(dfn.shape)
X, y = dfn.drop(['POPU'],1).values,dfn['POPU'].values

(3412, 37)


In [76]:
def ML_pipeline_kfold(X,y,random_state,n_folds):
    # split the data
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
    CV_scores = []
    test_scores = []
    # k folds - each fold will give us a CV and a test score
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    
    for train_index, CV_index in kf.split(X_other,y_other):

        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]
        # preprocessing
        mmx_ft = np.arange(3,36)
        ohe_ft = [0]
        kpt_ft = [1,2]
        
        scaler = MinMaxScaler()
        X_train_mmx = scaler.fit_transform(X_train[:,mmx_ft])
        X_c_mmx = scaler.transform(X_CV[:,mmx_ft])
        X_t_mmx = scaler.transform(X_test[:,mmx_ft])
        
        ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        X_train_ohe = ohe.fit_transform(X_train[:,ohe_ft])
        X_c_ohe = ohe.transform(X_CV[:,ohe_ft])
        X_t_ohe = ohe.transform(X_test[:,ohe_ft])
        
        X_train = np.concatenate((X_train_mmx, X_train_ohe, X_train[:,kpt_ft]),axis=1)
        X_c = np.concatenate((X_c_mmx, X_c_ohe, X_CV[:,kpt_ft]),axis=1)
        X_t = np.concatenate((X_t_mmx, X_t_ohe, X_test[:,kpt_ft]),axis=1)
        
        # tune ridge hyper-parameter, alpha
        mds = np.arange(3,5)
        train_score = []
        CV_score = []
        regs = []
        for md in mds:
            reg = RandomForestRegressor(max_depth = md, n_estimators = 100)
            reg.fit(X_train,y_train)
            train_score.append(mean_squared_error(y_train,reg.predict(X_train)))
            CV_score.append(mean_squared_error(y_CV,reg.predict(X_c)))
            regs.append(reg)
        # find the best alpha in this fold
        CV_score = np.array(CV_score)
        best_c = mds[np.argmin(CV_score)]
        # grab the best model
        reg = regs[np.argmin(CV_score)]
        CV_scores.append(np.min(CV_score))
        # calculate test score using thee best model
        test_scores.append(mean_squared_error(y_test,reg.predict(X_t)))
        print(reg.feature_importances)
#         plt.figure()
#         plt.scatter(y_test,reg.predict(X_t))
    return CV_scores,test_scores, best_c

#ML_pipeline_kfold(dfn.drop(['POPU'],1).values, dfn['POPU'].values,42,5)

In [None]:
test_scores = []
for rs in range(1,11):
    cv_score, test_score, best_c = ML_pipeline_kfold(X, y ,rs * 42, 5)
    test_scores.append(test_score)
    print("best_maxdepth is : ", best_c, "avg_MSE is : ", round(np.mean(test_score),2))
print('MSE :',np.around(np.mean(test_scores),2),'+/-',np.around(np.std(test_scores),2))

best_maxdepth is :  4 avg_MSE is :  0.37
best_maxdepth is :  4 avg_MSE is :  0.38
best_maxdepth is :  4 avg_MSE is :  0.38
best_maxdepth is :  4 avg_MSE is :  0.39
best_maxdepth is :  4 avg_MSE is :  0.41
best_maxdepth is :  4 avg_MSE is :  0.37
best_maxdepth is :  4 avg_MSE is :  0.38
best_maxdepth is :  4 avg_MSE is :  0.36


In [71]:
np.var(y-np.mean(y))

1.9570162652745657