In [1]:
#import package
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [2]:
#load data
# Read the training table from the working directory, then pull out the two
# columns used downstream: the "tweet" column as model input (X) and the
# "class" column as the label (Y).
train=pd.read_csv("train.csv")
X,Y=train["tweet"],train["class"]

In [3]:
#score
def score(pred_Y,test_Y):
    """Return the blended F1 score used to rank models in this notebook.

    Two macro-averaged F1 values are computed and combined:

    * AllF  - macro F1 over the labels exactly as supplied.
    * HateF - macro F1 after every label equal to 2 has been rewritten to 1,
      i.e. with classes 1 and 2 merged into a single class.

    The result is ``0.6 * HateF + 0.4 * AllF``.

    Parameters
    ----------
    pred_Y : array-like
        Predicted class labels.
    test_Y : array-like
        Reference class labels, element-wise aligned with ``pred_Y``.

    Returns
    -------
    float
        The weighted combination of the two macro F1 values.
    """
    # Coerce to ndarrays so that lists, Series and arrays all behave the same.
    # With a plain list, `labels[labels==2]=1` evaluates `labels==2` to the
    # scalar False and then overwrites element 0 instead of merging classes.
    y_pred=np.asarray(pred_Y)
    y_true=np.asarray(test_Y)

    #AllF
    # f1_score takes (y_true, y_pred). Per-class F1 is symmetric in the two,
    # so using the documented order does not change the value.
    score_all=f1_score(y_true,y_pred,average="macro")

    #HateF
    # Work on copies: np.asarray may hand back the caller's own buffer, and the
    # relabelling below must not leak into the caller's data.
    h_pred=y_pred.copy()
    h_true=y_true.copy()
    h_pred[h_pred==2]=1
    h_true[h_true==2]=1
    score_hate=f1_score(h_true,h_pred,average="macro")

    score_final=0.6*score_hate+0.4*score_all
    return score_final

In [8]:
#cross validation
# Hyper-parameter grid: every (max_depth, gamma) pair is evaluated.
list_depth=[4,6,8]
list_gamma=[0.01,0.05,0.1,0.15,0.2]

# Fold layout: k contiguous, unshuffled blocks of fold_size rows each.
# NOTE: when len(X) is not a multiple of k, the trailing len(X) % k rows are
# never placed in a test fold; they are always part of the training split.
k=5
n_samples=len(X)
fold_size=n_samples//k

# Neither the split nor the TF-IDF features depend on depth/gamma, so they are
# built once per fold here instead of being recomputed for every grid point.
folds=[]
for fold in range(k):

    #generate a boolean mask for the test set in this fold
    test_mask=np.zeros(n_samples,dtype=bool)
    test_mask[fold*fold_size:(fold+1)*fold_size]=True

    #create training and testing sets using this mask
    test_X,test_Y=X[test_mask],Y[test_mask]
    train_X,train_Y=X[~test_mask],Y[~test_mask]

    #TfidfVectorizer
    # Fitted on the training rows only; the test rows are just transformed.
    tfidf=TfidfVectorizer()
    tfidf.fit(train_X)
    train_X=tfidf.transform(train_X)
    test_X=tfidf.transform(test_X)

    folds.append((train_X,train_Y,test_X,test_Y))

#cross validation over the grid, reusing the prepared folds
for depth in list_depth:
    for gamma in list_gamma:
        scores=[]
        for train_X,train_Y,test_X,test_Y in folds:

            #training
            clf=XGBClassifier(eval_metric='mlogloss',use_label_encoder=False,max_depth=depth,gamma=gamma)
            clf.fit(train_X,train_Y)
            pred_Y=clf.predict(test_X)
            scores.append(score(pred_Y,test_Y))

        # Report the mean and variance of the k fold scores for this grid point.
        print("depth:",depth,"gamma:",gamma)
        print("mean:",np.mean(scores))
        print("var:",np.var(scores))
        print("\n")

depth: 4 gamma: 0.01
mean: 0.6602982365033391
var: 0.0004115144900970859


depth: 4 gamma: 0.05
mean: 0.6596486850728319
var: 0.00037726687162030904


depth: 4 gamma: 0.1
mean: 0.6561513870325127
var: 0.0004503910607779933


depth: 4 gamma: 0.15
mean: 0.6552704180952007
var: 0.00039962740632588044


depth: 4 gamma: 0.2
mean: 0.652595483747314
var: 0.00026905130757560856


depth: 6 gamma: 0.01
mean: 0.6631832396942174
var: 0.00012819133874629988


depth: 6 gamma: 0.05
mean: 0.6633819597772858
var: 0.0002344005384312185


depth: 6 gamma: 0.1
mean: 0.6649232053280147
var: 0.00031626124250765434


depth: 6 gamma: 0.15
mean: 0.6631654253732635
var: 0.00017882940414277673


depth: 6 gamma: 0.2
mean: 0.6622229431258451
var: 0.000242994115025024


depth: 8 gamma: 0.01
mean: 0.6634171094872934
var: 0.00022686323222655515


depth: 8 gamma: 0.05
mean: 0.6623283305702428
var: 5.772871409937006e-05


depth: 8 gamma: 0.1
mean: 0.6592899047095953
var: 9.566957050936723e-05


depth: 8 gamma: 0.15
mean