In [10]:
from string import punctuation
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from IPython.display import display, HTML
import pandas as pd 


#Load and parse data from xml file
def load_data(path):
    doc = open(path,encoding='utf8').read()
    xmldata = BeautifulSoup(doc, "html.parser")

    data = []
    for book in xmldata.findAll('book'):
        parse_xml = BeautifulSoup(str(book),"html.parser")
        blurb = str(parse_xml.find('body').string)
        topcategory = str(parse_xml.find("topic",{"d":"0"}).string)
        data.append((blurb, topcategory))

    return data

#load train data
text_train = load_data('C:\\workspace\\germeval2019t1datasets\\blurbs_train.txt')
blurbs_train = [text[0] for text in text_train]
y = [text[1] for text in text_train]

#Convert to tf-idf vector
stopwords = get_stop_words('de') + list(punctuation)
#vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(blurbs_train) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)




In [17]:
#===============================   Decision Tree ====================================

def decisiontree(maxdepth=None, minsplit=2, minleaf=1):
    
    #DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)
    dtclassifier = DecisionTreeClassifier(max_depth=maxdepth,min_samples_split=minsplit, min_samples_leaf=minleaf)
    dtmodel = dtclassifier.fit(X_train, y_train)
    y_prediction = dtmodel.predict(X_test)
    accuracyscore = accuracy_score(y_test, y_prediction)
    #print(classification_report(y_test,y_prediction))

    return accuracyscore


print("Parameter yang diujikan untuk eksperimen pada Decision Tree Classifier")
print("=======================================================================")
dtparameters = {'Parameters' : ['max_depth', 'min_samples_split', 'min_samples_leaf'], 
                'Values' : ['None, 5,10,20,30,40','2, 5, 10','1, 5, 10']
                } 
dtdf = pd.DataFrame(dtparameters) 
display(HTML(dtdf.to_html(index=False))) 

print("Hasil accuracy score dari konfigurasi parameter yang diujikan")
print("=============================================================")

dtresult = {'max_depth':['Default',5,10,20,30,30,40],
           'min_samples_split':['Default',5,5,10,10,10,10],
           'min_samples_leaf':['Default',5,5,5,10,15,15],
           'Accuracy score' :[decisiontree(),decisiontree(5,5,5),decisiontree(10,5,5),decisiontree(20,10,5)
                             ,decisiontree(30,10,10),decisiontree(30,10,15),decisiontree(40,10,15)]}
dtresultdf = pd.DataFrame(dtresult)
display(HTML(dtresultdf.to_html(index=False)))

print("Berdasarkan hasil eksperimen diatas, didapatkan parameter yang optimal")
print("======================================================================")
dtoptimal = {'Parameters' : ['max_depth', 'min_samples_split', 'min_samples_leaf'], 
                'Values' : [30,10,10]
                } 
dtoptimaldf = pd.DataFrame(dtoptimal) 
display(HTML(dtoptimaldf.to_html(index=False))) 


Parameter yang diujikan untuk eksperimen pada Decision Tree Classifier


Parameters,Values
max_depth,"None, 5,10,20,30,40"
min_samples_split,"2, 5, 10"
min_samples_leaf,"1, 5, 10"


Hasil accuracy score dari konfigurasi parameter yang diujikan


max_depth,min_samples_split,min_samples_leaf,Accuracy score
Default,Default,Default,0.594502
5,5,5,0.570103
10,5,5,0.593471
20,10,5,0.605842
30,10,10,0.612027
30,10,15,0.597595
40,10,15,0.59244


Berdasarkan hasil eksperimen diatas, didapatkan parameter yang optimal


Parameters,Values
max_depth,30
min_samples_split,10
min_samples_leaf,10


In [None]:
#================================ Random Forest  ======================================
#RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
def randomforest(n_jobs=None, criterion='gini',n_estimators=100, 
                 min_samples_leaf=1, min_samples_split=2, class_weight=None, 
                 bootstrap=True):
    rfclassifier = RandomForestClassifier(n_estimators=100)
    rfclassifier = rfclassifier.fit(X_train, y_train)
    y_prediction = rfclassifier.predict(X_test)
    #print(classification_report(y_test,y_prediction))
    accuracyscore = accuracy_score(y_test, y_prediction)
    
    return accuracyscore

print("Parameter yang diujikan untuk eksperimen pada Random Foresst Classifier")
print("=======================================================================")
rfparameters = {'Parameters' : ['n_jobs','criterion','n_estimators','min_samples_leaf',
                               'min_samples_split','class_weight','boostrap'], 
                'Values' : ['-1','gini','150,200,250,300','1,2,3','2,4,5,7,16,20',
                           'balanced, balanced_subsample','False']
                } 
rfdf = pd.DataFrame(rfparameters) 
display(HTML(rfdf.to_html(index=False))) 

print("Hasil accuracy score dari konfigurasi parameter yang diujikan")
print("=============================================================")

rfresult = {'n_jobs':[None,-1,-1,-1,-1,-1,-1],
           'criterion' : ['gini','gini','gini','gini','gini','gini','gini'],
           'n_estimators' : [100,150,200,300,300,300,300],
           'min_samples_leaf':[1,2,3,1,1,1,2],
           'min_samples_split' :[2,4,4,5,16,20,7],
           'class_weight' :[None,'balanced','balanced_subsample','balanced_subsample','balanced_subsample','balanced_subsample','balanced_subsample'],
           'bootstrap':[True,False,False,False,False,False,False],
           'Accuracy score':[randomforest(),
                             randomforest(-1,'gini',150,2,4,'balanced',False),
                             randomforest(-1,'gini',200,3,4,'balanced_subsample'),
                             randomforest(-1,'gini',300,1,5,'balanced_subsample'),
                             randomforest(-1,'gini',300,1,16,'balanced_subsample'),
                             randomforest(-1,'gini',300,1,20,'balanced_subsample'),
                             randomforest(-1,'gini',300,2,7,'balanced_subsample'),
                            ]}
rfresultdf = pd.DataFrame(rfresult)
display(HTML(rfresultdf.to_html(index=False)))

print("Berdasarkan hasil eksperimen diatas, didapatkan parameter yang optimal : ")
rfoptimal = {'Parameters' : ['n_jobs','criterion','n_estimators','min_samples_leaf','min_samples_split','class_wight','bootstrap'], 
                'Values' : [-1,'gini',300,1,16,'balanced_subsample',False]
                } 
rfoptimaldf = pd.DataFrame(rfoptimal) 
display(HTML(rfoptimaldf.to_html(index=False))) 

Parameter yang diujikan untuk eksperimen pada Random Foresst Classifier


Parameters,Values
n_jobs,-1
criterion,gini
n_estimators,150200250300
min_samples_leaf,123
min_samples_split,24571620
class_weight,"balanced, balanced_subsample"
boostrap,False


Hasil accuracy score dari konfigurasi parameter yang diujikan


In [26]:
#================================ XGBoost  ======================================
def xgboost(learning_rate=0.3, max_depth=6, min_child_weight=1,gamma=0, colsample_bytree=0.5, n_estimators=100):
    xgbclassifier = xgb.XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, min_child_weight=min_child_weight,
                                      gamma = gamma, colsample_bytree = colsample_bytree,n_estimators=n_estimators, objective="multi:softprob", )
    xgbmodel = xgbclassifier.fit(X_train, y_train)
    y_prediction = xgbmodel.predict(X_test)

    #print(classification_report(y_test,y_prediction))
    accuracyscore = accuracy_score(y_test, y_prediction)
    return accuracyscore


print("Parameter yang diujikan untuk eksperimen pada XGBoost Classifier")
print("=======================================================================")
xgparameters = {'Parameters' : ['learning_rate','max_depth','min_child_weight',
                                'gamma','colsample_bytree','n_estimators'], 
                'Values' : ['0.05, 0.1, 0.3, 0.5, 0.8', 
                            '6, 10, 20, 30', '1,3,5,7',
                           '0.0, 0.1, 0.2, 0.3, 0.4', 
                            '0.3, 0.4, 0.5 , 0.7','100,200,300,500']
                } 
xgdf = pd.DataFrame(xgparameters) 
display(HTML(xgdf.to_html(index=False))) 

print("Hasil accuracy score dari konfigurasi parameter yang diujikan")
print("=============================================================")

xgresult = {'learning_rate':[0.3,0.05, 0.5,0.5,0.5,0.5, 0.8,0.8],
           'max_depth':[6,10,10,20,20,20,30,30],
           'min_child_weight':[1,3,3,5,5,5,7,7],
            'gamma':[0, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3,0.4],
            'colsample_bytree':[0.5,0.3,0.3,0.4,0.4,0.4,0.5,0.7],
            'n_estimators': [100,100,100,100,200,300,300,500],
           'Accuracy score' :[xgboost(), 
                              xgboost(0.05,10,3,0.1,0.3,100), 
                              xgboost(0.5,10,3,0.1,0.3,100),
                              xgboost(0.5,20,5,0.2,0.4,100),
                              xgboost(0.5,20,5,0.2,0.4,200),
                              xgboost(0.5,20,5,0.2,0.4,300),
                              xgboost(0.8,30,7,0.3,0.5,300),
                              xgboost(0.8,30,7,0.4,0.7,500)]}
xgresultdf = pd.DataFrame(xgresult)
display(HTML(xgresultdf.to_html(index=False)))

print("Berdasarkan hasil eksperimen diatas, didapatkan parameter yang optimal")
print("======================================================================")
xgoptimal = {'Parameters' : [], 
                'Values' : []
                } 
xgoptimaldf = pd.DataFrame(xgoptimal) 
display(HTML(xgoptimaldf.to_html(index=False))) 

Parameter yang diujikan untuk eksperimen pada XGBoost Classifier


Parameters,Values
learning_rate,"0.05, 0.1, 0.3, 0.5, 0.8"
max_depth,"6, 10, 20, 30"
min_child_weight,1357
gamma,"0.0, 0.1, 0.2, 0.3, 0.4"
colsample_bytree,"0.3, 0.4, 0.5 , 0.7"
n_estimators,100200300500


Hasil accuracy score dari konfigurasi parameter yang diujikan


learning_rate,max_depth,min_child_weight,gamma,colsample_bytree,n_estimators,Accuracy score
0.3,6,1,0.0,0.5,100,0.718213
0.05,10,3,0.1,0.3,100,0.682131
0.5,10,3,0.1,0.3,100,0.741237
0.5,20,5,0.2,0.4,100,0.728179
0.5,20,5,0.2,0.4,200,0.728522
0.5,20,5,0.2,0.4,300,0.726117
0.8,30,7,0.3,0.5,300,0.71512
0.8,30,7,0.4,0.7,500,0.719931


Berdasarkan hasil eksperimen diatas, didapatkan parameter yang optimal


Parameters,Values


In [None]:
xgboost(0.5,10,3,0.1,0.3,200)
xgboost(0.5,10,3,0.1,0.3,300)
xgboost(0.5,10,3,0.1,0.3,500)