In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../data/News_pro.csv')
print(df.head(2))

    IDLink    Topic  PublishDate  SentimentTitle  SentimentHeadline  Facebook  \
0  99248.0    obama     0.000000        0.000000          -0.053300        -1   
1  10423.0  economy     0.453492        0.208333          -0.156386        -1   

   GooglePlus  LinkedIn  Facebook_pro  GooglePlus_pro  LinkedIn_pro  \
0          -1        -1           0.0             0.0           0.0   
1          -1        -1           0.0             0.0           0.0   

   x0_economy  x0_microsoft  x0_obama  x0_palestine  BestPlat  
0         0.0           0.0       1.0           0.0         0  
1         1.0           0.0       0.0           0.0         0  


In [4]:
# drop IDLink and Topic
# drop Facebook GooglePlus and LinkedIn
# maybe I can even drop _pro for they have been truned into BestPlat
df = df.drop(columns=['IDLink', 'Topic', 'Facebook','GooglePlus','LinkedIn'])
df = df.drop(columns=['Facebook_pro','GooglePlus_pro','LinkedIn_pro'])
print(df.head(10))
df_origin = df.copy()

   PublishDate  SentimentTitle  SentimentHeadline  x0_economy  x0_microsoft  \
0     0.000000        0.000000          -0.053300         0.0           0.0   
1     0.453492        0.208333          -0.156386         1.0           0.0   
2     0.688586       -0.425210           0.139754         1.0           0.0   
3     0.905065        0.000000           0.026064         1.0           0.0   
4     0.905066        0.000000           0.141084         1.0           0.0   
5     0.905067       -0.075378           0.036773         0.0           1.0   
6     0.975033        0.000000          -0.005906         0.0           0.0   
7     0.905071        0.083333           0.103003         0.0           0.0   
8     0.905075       -0.173925          -0.050185         0.0           0.0   
9     0.905077       -0.059536          -0.081715         0.0           1.0   

   x0_obama  x0_palestine  BestPlat  
0       1.0           0.0         0  
1       0.0           0.0         0  
2       0.0     

In [4]:
# Thus, we have
# x: PublishDate, SentimentTitle, SentimentHeadline, Topic(*4)
# y: BestPlat

In [5]:
df.replace(to_replace=' ?',value = np.nan, inplace=True)

print(df.shape)
print(df.isnull().sum(axis=0)/df.shape[0])
print(sum(df.isnull().sum(axis=1)!=0)/df.shape[0])

(93239, 8)
PublishDate          0.0
SentimentTitle       0.0
SentimentHeadline    0.0
x0_economy           0.0
x0_microsoft         0.0
x0_obama             0.0
x0_palestine         0.0
BestPlat             0.0
dtype: float64
0.0


In [6]:
# There is no missing value.

In [6]:
# Separate out the feature matrix (X) and the target variable (y).
X = df.drop(columns=['BestPlat'],inplace=False)
y = df['BestPlat']

In [7]:
print(y.value_counts())

1    53721
4    14875
3    12105
0     5745
5     4496
2     2297
Name: BestPlat, dtype: int64


In [8]:
# 0: if there are two or more -1, which means that the news is not posted in any of the platforms. (can be dropped)
# 1: F
# 2: G
# 3: L
# 4: if the news has been posted on all three platform but all three platforms has zero popularity. (indifference)
# 5: all the other samples, f = g > l etc. (can be dropped)

per = y.value_counts()/y.shape
print(per)

1    0.576164
4    0.159536
3    0.129828
0    0.061616
5    0.048220
2    0.024636
Name: BestPlat, dtype: float64


In [9]:
# drop the rows when BestPlat = 0 or 5 (the percentage is not very high so we can do this.)
df = df.set_index('BestPlat')
df = df.drop([0,5], axis=0)
df.reset_index(inplace=True)

# generate new X and y
X = df.drop(columns=['BestPlat'],inplace=False)
y = df['BestPlat']
per = y.value_counts()/y.shape
print(per)

# 1: F
# 2: G
# 3: L
# 4: indifference

1    0.647257
4    0.179221
3    0.145847
2    0.027675
Name: BestPlat, dtype: float64


In [11]:
X.shape

(82998, 7)

In [43]:
# Split my data in a stratified manner into other and test (20% in `test`) 
# Then split other into 5 stratified folds. 
# 4 of those folds will be used for training, the last fold will be CV. 
# Loop through the 5 options the CV fold can be selected.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from collections import defaultdict

In [44]:
X = X.values 
y = y.values

In [24]:
def ML_pipeline_kfold_LR(X,y,random_state,n_folds=5):
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state, stratify = y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True,random_state=random_state)

    CV_scores = []
    test_scores = []

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # tune lasso hyper-parameter, alpha
        alpha = np.logspace(-3,3,5)
        train_score = []
        CV_score = []
        regs = []
        for a in alpha:
            # print("a",a)
            reg = LogisticRegression(penalty='l1', C=1/a, solver="saga", max_iter=10000, multi_class="multinomial")
            reg.fit(X_train, y_train)
            train_score.append(accuracy_score(y_train, reg.predict(X_train)))
            CV_score.append(accuracy_score(y_CV, reg.predict(X_CV)))
            regs.append(reg)
            # print('end')
        # find the best alpha in this fold
        best_alpha = alpha[np.argmax(CV_score)]
        # grab the best model
        reg = regs[np.argmax(CV_score)]
        CV_scores.append(np.max(CV_score))
        # calculate test score using thee best model
        test_scores.append(accuracy_score(y_test, reg.predict(X_test)))
        
    print("best alpha is ", best_alpha)
    return CV_scores, test_scores

In [25]:
test_scores = [] 
for i in range(5): 
    grid, test_score = ML_pipeline_kfold_LR(X,y,i * 610, 5) 
    test_scores.append(test_score) 
print('test Score:', np.around(np.mean(test_scores),2), "+/-", np.around(np.std(test_scores),2)) 

best alpha is  0.001
best alpha is  1.0
best alpha is  1.0
best alpha is  1.0
best alpha is  1.0
test Score: 0.66 +/- 0.0


In [None]:
0.647257

In [26]:
def ML_pipeline_kfold_LR(X,y,random_state,n_folds=5):
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state, stratify = y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True,random_state=random_state)

    CV_scores = []
    test_scores = []

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # tune lasso hyper-parameter, alpha
        alpha = np.logspace(-5,5,10)
        train_score = []
        CV_score = []
        regs = []
        for a in alpha:
            # print("a",a)
            reg = LogisticRegression(penalty='l1', C=1/a, solver="saga", max_iter=10000, multi_class="multinomial")
            reg.fit(X_train, y_train)
            train_score.append(accuracy_score(y_train, reg.predict(X_train)))
            CV_score.append(accuracy_score(y_CV, reg.predict(X_CV)))
            regs.append(reg)
            # print('end')
        # find the best alpha in this fold
        best_alpha = alpha[np.argmax(CV_score)]
        # grab the best model
        reg = regs[np.argmax(CV_score)]
        CV_scores.append(np.max(CV_score))
        # calculate test score using thee best model
        test_scores.append(accuracy_score(y_test, reg.predict(X_test)))
        
    print("best alpha is ", best_alpha)
    return CV_scores, test_scores

test_scores = [] 
for i in range(3): 
    grid, test_score = ML_pipeline_kfold_LR(X,y,i * 610, 5) 
    test_scores.append(test_score) 
print('test Score:', np.around(np.mean(test_scores),2), "+/-", np.around(np.std(test_scores),2)) 

best alpha is  3.593813663804626
best alpha is  0.2782559402207126
best alpha is  3.593813663804626
test Score: 0.66 +/- 0.0


In [29]:
print(test_scores)
print(np.std(test_scores))

[[0.6551204819277109, 0.6569879518072289, 0.6553614457831325, 0.6568072289156627, 0.6550602409638554], [0.6571686746987951, 0.6557228915662651, 0.6571686746987951, 0.6571084337349398, 0.6574096385542169], [0.6575301204819277, 0.6569879518072289, 0.6569879518072289, 0.6557228915662651, 0.6560240963855422]]
0.0008430483213251509


In [30]:
import time

In [31]:
def ML_pipeline_kfold_LR(X,y,random_state,n_folds=5):
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state, stratify = y)
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True,random_state=random_state)

    CV_scores = []
    test_scores = []

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other[train_index], X_other[CV_index]
        y_train, y_CV = y_other[train_index], y_other[CV_index]

        # tune lasso hyper-parameter, alpha
        alpha = np.logspace(-5,5,10)
        train_score = []
        CV_score = []
        regs = []
        for a in alpha:
            # print("a",a)
            reg = LogisticRegression(penalty='l1', C=1/a, solver="saga", max_iter=10000, multi_class="multinomial")
            reg.fit(X_train, y_train)
            train_score.append(accuracy_score(y_train, reg.predict(X_train)))
            CV_score.append(accuracy_score(y_CV, reg.predict(X_CV)))
            regs.append(reg)
            # print('end')
        # find the best alpha in this fold
        best_alpha = alpha[np.argmax(CV_score)]
        # grab the best model
        reg = regs[np.argmax(CV_score)]
        CV_scores.append(np.max(CV_score))
        # calculate test score using thee best model
        test_scores.append(accuracy_score(y_test, reg.predict(X_test)))
        
    print("best alpha is ", best_alpha)
    return CV_scores, test_scores

s = time.time()
test_scores = [] 
for i in range(5): 
    grid, test_score = ML_pipeline_kfold_LR(X,y,i * 610, 5) 
    test_scores.append(test_score) 
e = time.time()
print('test Score:', np.around(np.mean(test_scores),2), "+/-", np.around(np.std(test_scores),2)) 
t = e-s
print('time:', t)

best alpha is  3.593813663804626
best alpha is  0.2782559402207126
best alpha is  3.593813663804626
best alpha is  0.2782559402207126
best alpha is  1e-05
test Score: 0.66 +/- 0.0
time: 1291.3945047855377


In [34]:
# i = 5
# alpha = 10

In [32]:
print(test_scores)
print(np.std(test_scores))

[[0.6551204819277109, 0.6569879518072289, 0.6553614457831325, 0.6568072289156627, 0.6550602409638554], [0.6571686746987951, 0.6557228915662651, 0.6571686746987951, 0.6571084337349398, 0.6574096385542169], [0.6575301204819277, 0.6569879518072289, 0.6569879518072289, 0.6557228915662651, 0.6560240963855422], [0.6551204819277109, 0.6551807228915663, 0.656566265060241, 0.6554819277108433, 0.6555421686746988], [0.6542168674698795, 0.6546987951807229, 0.6546385542168675, 0.6546385542168675, 0.6544578313253012]]
0.0010407462958729847


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.svm import SVC