# DEALING WITH META DATA

In [None]:
# Load the training metadata table into a DataFrame.
# NOTE(review): "path_to_train_data" looks like a placeholder path - confirm before running.
train_data = pd.read_csv("path_to_train_data")
# Preview the first rows (shown as the cell output when run in a notebook).
train_data.head()

In [None]:
# Print the column dtypes and non-null counts, to see which columns have missing values.
train_data.info()

# PREPROCESS

In [None]:
# Replace missing values: empty string for the post text, 0 for every other column.
train_data["post_message"] = train_data["post_message"].fillna("")
train_data = train_data.fillna(0)

# Keep only the rows whose timestamp_post can be parsed as a number.
train_data = train_data.loc[
    pd.to_numeric(train_data["timestamp_post"], errors="coerce").notna()
]

In [None]:
# Clean up some features
def preprocess(x):
    """Extract the leading integer from a raw cell value.

    The value is converted with ``str`` and only its first
    whitespace-separated token is parsed, so ``"12 likes"`` yields ``12``.
    Float-formatted tokens such as ``"12.0"`` are truncated towards zero
    instead of being discarded.

    Args:
        x: Raw value of any type.

    Returns:
        int: The parsed number, or ``0`` when the value is empty or its
        first token is not a finite number.
    """
    tokens = str(x).split()
    if not tokens:
        # Empty or whitespace-only value.
        return 0

    token = tokens[0]
    try:
        return int(token)
    except ValueError:
        pass

    try:
        # Covers values that were stored as floats, e.g. "12.0".
        return int(float(token))
    except (ValueError, OverflowError):
        # Not numeric at all, or nan / inf.
        return 0

# Timestamps become floats; the three engagement counters are reduced to plain integers.
train_data["timestamp_post"] = train_data["timestamp_post"].astype(float)
train_data["num_like_post"] = train_data["num_like_post"].apply(preprocess)
train_data["num_share_post"] = train_data["num_share_post"].apply(preprocess)
train_data["num_comment_post"] = train_data["num_comment_post"].apply(preprocess)

In [None]:
# Get features from images

def get_num_image(id, image_root="public_train_final_images"):
    """Count the directory entries stored under ``image_root/<id>``.

    Args:
        id: Identifier whose ``str()`` form names the sub-directory to list.
            The parameter name shadows the builtin but is kept so existing
            callers keep working.
        image_root: Directory that contains one sub-directory per id.
            Defaults to the location that was previously hard-coded.

    Returns:
        int: Number of entries in the sub-directory, or ``0`` when it cannot
        be listed (missing, not a directory, permission denied, invalid path).
    """
    img_path = os.path.join(image_root, str(id))
    try:
        return len(os.listdir(img_path))
    except (OSError, ValueError):
        # OSError: path missing / not a directory / unreadable.
        # ValueError: path contains an embedded null byte.
        return 0

# One count per row, in row order.
train_data["num_image"] = [get_num_image(post_id) for post_id in train_data["id"]]

In [None]:
# Get more features from the timestamp

from datetime import datetime

# Convert the POSIX timestamps to datetimes. datetime.fromtimestamp is called
# without a tz argument, so the values are in the local time zone of the machine.
train_data["datetime"] = pd.to_datetime(
    [datetime.fromtimestamp(timestamp) for timestamp in train_data["timestamp_post"]]
)

# Calendar components taken through the vectorised .dt accessor.
train_data["minute in hour"] = train_data["datetime"].dt.minute
train_data["hour in day"] = train_data["datetime"].dt.hour
train_data["day in month"] = train_data["datetime"].dt.day
# Calendar quarter 1-4 (the previous `month % 4` did not map months to quarters).
train_data["quarter in year"] = train_data["datetime"].dt.quarter
train_data["month in year"] = train_data["datetime"].dt.month
# Monday == 0 ... Sunday == 6, the same convention as date.weekday().
train_data["weekday"] = train_data["datetime"].dt.weekday

# TRAINING

In [None]:
# Load libraries

from pandas import set_option
# from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [None]:
# Hold out 10% of the rows for validation (fixed seed for repeatability).
train, valid = train_test_split(train_data, test_size=0.1, random_state=42)

# Score user_name
# Map each user name to the number of training rows it has with label == 1.
black_list_user = (
    train.loc[train["label"] == 1, "user_name"]
    .value_counts()
    .to_dict()
)

def get_score_username(user_name, black_list_user):
    """Score a user name against a blacklist of counts.

    Args:
        user_name: Key to look up.
        black_list_user: Mapping from user name to a numeric count.

    Returns:
        ``10`` for a name that is not in the mapping, otherwise
        ``10 - 5 * count`` (which can be zero or negative).
    """
    penalty = 5 * black_list_user[user_name] if user_name in black_list_user else 0
    return 10 - penalty

# train['user_score'] = [get_score_username(user_name, black_list_user) for user_name in train['user_name']] 
# valid['user_score'] = [get_score_username(user_name, black_list_user) for user_name in valid['user_name']] 

# Drop the target, identifiers, raw text and raw time columns; the rest are the model inputs.
X_train = train.drop(["label", "id", "user_name", "post_message", "datetime", "timestamp_post"], axis=1)
Y_train = train["label"]

X_valid = valid.drop(["label", "id", "user_name", "post_message", "datetime", "timestamp_post"], axis=1)
Y_valid = valid["label"]

# Fit the scaler on the training split only and reuse it on the validation
# split. Re-fitting on validation would scale it with its own min/max, which
# puts the two splits on different scales and leaks validation statistics.
mm_scaler = preprocessing.MinMaxScaler()
X_train = mm_scaler.fit_transform(X_train)
X_valid = mm_scaler.transform(X_valid)

# Same rule for the label encoder: learn the classes from the training split.
# ravel() turns the (n, 1) column produced for binary labels into the 1-D
# vector that the estimators expect.
lb_binary = preprocessing.LabelBinarizer()
Y_train = lb_binary.fit_transform(Y_train).ravel()
Y_valid = lb_binary.transform(Y_valid).ravel()

# Oversample the minority class in the training split only, up to a
# minority/majority ratio of 0.66.
ros = RandomOverSampler(sampling_strategy=0.66, random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X_train, Y_train)

### Baseline models

In [None]:
# Spot-Check Algorithms
def GetBasedModel():
    """Return the baseline candidates as a list of ``(name, estimator)`` pairs.

    Every estimator is freshly constructed with its default arguments.
    """
    return [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
        ('AB', AdaBoostClassifier()),
        ('GBM', GradientBoostingClassifier()),
        ('RF', RandomForestClassifier()),
        ('ET', ExtraTreesClassifier()),
        ('MLP', MLPClassifier()),
    ]

In [None]:
def BasedLine_resampled(X_train, y_train, X_test, y_test, models):
    """Fit each model on the training data and report its ROC AUC on the test data.

    ROC AUC is computed from continuous scores (``decision_function`` or the
    positive-class column of ``predict_proba``) when the model offers them.
    Hard ``predict`` labels are only the fallback, because they reduce the
    ROC curve to a single operating point.

    Args:
        X_train: Training features.
        y_train: Training labels (binary).
        X_test: Hold-out features.
        y_test: Hold-out labels (binary).
        models: Iterable of ``(name, estimator)`` pairs; each estimator is
            fitted in place.

    Returns:
        tuple: ``(names, results)`` - the model names and their ROC AUC
        scores, in the order the models were given.
    """
    results = []
    names = []
    for name, model in models:
        model.fit(X_train, y_train)

        # Prefer a ranking score over hard class labels.
        if hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        elif hasattr(model, "predict_proba"):
            # Column 1 is the probability of the second (positive) class.
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.predict(X_test)

        cv_results = roc_auc_score(y_test, y_score)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f" % (name, cv_results)
        print(msg)

    return names, results

In [None]:
def BasedLine(X, y, models):
    """Cross-validate each model and print the mean and std of its ROC AUC.

    Args:
        X: Feature matrix.
        y: Labels.
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        tuple: ``(names, results)`` where ``results`` holds one array of
        per-fold scores per model, in the order the models were given.
    """
    # Evaluation settings: 10 stratified, shuffled folds scored by ROC AUC.
    num_folds = 10
    scoring = 'roc_auc'

    names, results = [], []
    for name, model in models:
        splitter = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
        fold_scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
        names.append(name)
        results.append(fold_scores)
        print("%s: %f (%f)" % (name, fold_scores.mean(), fold_scores.std()))

    return names, results

In [None]:
# Build the baseline candidates once; both evaluations below use the same list.
models = GetBasedModel()

# 1) Stratified 10-fold cross-validation on the scaled (not oversampled) training data.
print("Use K Fold in cross_val_score ----------------------")
names, results = BasedLine(X_train, Y_train, models)
# 2) Fit on the oversampled training data and score on the held-out validation split.
#    This overwrites `names` and `results` from step 1.
print("Use train test split ---------------------------------")
names, results = BasedLine_resampled(X_resampled, Y_resampled, X_valid, Y_valid, models)

### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from numpy import hstack

def get_stacking():
    """Build an unfitted stacking ensemble.

    Level 0 holds seven default-configured classifiers (KNN, CART, NB, SVM,
    GBM, RF, MLP); LR, LDA, AB and ET are deliberately left out. Level 1 is
    a logistic regression trained on 5-fold out-of-fold predictions.

    Returns:
        StackingClassifier: The assembled, unfitted ensemble.
    """
    base_learners = [
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
        ('GBM', GradientBoostingClassifier()),
        ('RF', RandomForestClassifier()),
        ('MLP', MLPClassifier()),
    ]
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

In [None]:
# Cross-validated ROC AUC of the stacking ensemble on the oversampled training data.
# (BasedLine2 / BasedLine1 are not defined in this notebook; the argument lists
# match BasedLine and BasedLine_resampled respectively.)
names, results = BasedLine(X_resampled, Y_resampled, [('stacking', get_stacking())])

# Hold-out ROC AUC: fit on the oversampled training data, score on the validation split.
names, results = BasedLine_resampled(X_resampled, Y_resampled, X_valid, Y_valid, [('stacking', get_stacking())])

### Blending

In [None]:
def blending(models, X_train, X_val, y_train, y_val):
    """Fit the base models and a logistic-regression blender on top of them.

    Each base model is fitted in place on the training set. Its class
    predictions on the hold-out set become one input column for the blender.

    Args:
        models: Iterable of ``(name, estimator)`` pairs.
        X_train: Features used to fit the base models.
        X_val: Hold-out features used to produce the blender inputs.
        y_train: Labels for ``X_train``.
        y_val: Labels for ``X_val``; the blender's targets.

    Returns:
        LogisticRegression: The fitted blender.
    """
    holdout_columns = []
    for _, base_model in models:
        base_model.fit(X_train, y_train)
        holdout_pred = base_model.predict(X_val)
        # One column per base model.
        holdout_columns.append(holdout_pred.reshape(len(holdout_pred), 1))

    # Side-by-side columns form the 2-D meta-feature matrix.
    meta_features = hstack(holdout_columns)

    blender = LogisticRegression()
    blender.fit(meta_features, y_val)
    return blender

# Predict with the blending ensemble.
def predict_ensemble(models, blender, X_test):
    """Run every base model on ``X_test`` and blend their predictions.

    Args:
        models: Iterable of ``(name, fitted_estimator)`` pairs.
        blender: Fitted model whose ``predict`` takes one column per base model.
        X_test: Features to predict on.

    Returns:
        Whatever ``blender.predict`` returns for the stacked base predictions.
    """
    base_predictions = (fitted.predict(X_test) for _, fitted in models)
    # Each base prediction becomes a single column of the meta-feature matrix.
    columns = [preds.reshape(len(preds), 1) for preds in base_predictions]
    return blender.predict(hstack(columns))

In [None]:
# Split the oversampled training data again: 67% to fit the base models and
# 33% held out to fit the blender. NOTE: this rebinds X_train, so later cells
# no longer see the full scaled training matrix under that name.
X_train, X_val, y_train, y_val = train_test_split(X_resampled, Y_resampled, test_size=0.33, random_state=1)

# Create fresh, unfitted base models.
models = GetBasedModel()
# Fit the base models and the blender on top of their hold-out predictions.
blender = blending(models, X_train, X_val, y_train, y_val)
# Predict on the validation split that was set aside before oversampling.
yhat = predict_ensemble(models, blender, X_valid)
# Evaluate the predictions.
# NOTE(review): yhat comes from blender.predict, i.e. class labels rather than
# scores, so this AUC is based on a single operating point.
score = roc_auc_score(Y_valid, yhat)
# The score is multiplied by 100, so it is printed as a percentage.
print('Blending ROC_AUC: %.3f' % (score*100))