In [11]:
from string import punctuation
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#Load and parse data from xml file
#todo : add other info such as title, isbn and date publish as data
def load_data(path):
    doc = open(path,encoding='utf8').read()
    xmldata = BeautifulSoup(doc, "html.parser")

    data = []
    for book in xmldata.findAll('book'):
        parse_xml = BeautifulSoup(str(book),"html.parser")
        blurb = str(parse_xml.find('body').string)
        topcategory = str(parse_xml.find("topic",{"d":"0"}).string)
        data.append((blurb, topcategory))

    return data

#load train data
text_train = load_data('C:\\workspace\\germeval2019t1datasets\\blurbs_train.txt')
blurbs_train = [text[0] for text in text_train]
y = [text[1] for text in text_train]

#Convert to tf-idf vector
stopwords = get_stop_words('de') + list(punctuation)
#vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(blurbs_train) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)




In [12]:
#===============================   Decision Tree ====================================
#DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)

dtclassifier = DecisionTreeClassifier()
dtmodel = dtclassifier.fit(X_train, y_train)

#predict
y_prediction = dtmodel.predict(X_test)

#Model evaluation
print("========================= Decision Tree Evaluation ==============================")
print(classification_report(y_test,y_prediction))
print("Accuracy score : ".accuracy_score(y_test, y_prediction))


                            precision    recall  f1-score   support

      Architektur & Garten       0.30      0.21      0.24        29
Ganzheitliches Bewusstsein       0.38      0.22      0.27       139
            Glaube & Ethik       0.35      0.30      0.32        98
   Kinderbuch & Jugendbuch       0.44      0.42      0.43       377
                    Künste       0.38      0.10      0.15        31
  Literatur & Unterhaltung       0.73      0.80      0.76      1490
                  Ratgeber       0.50      0.50      0.50       344
                  Sachbuch       0.40      0.37      0.38       402

                  accuracy                           0.60      2910
                 macro avg       0.43      0.36      0.38      2910
              weighted avg       0.58      0.60      0.59      2910

0.597594501718213


In [6]:
#================================ Random Forest  ======================================
#RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
rfclassifier = RandomForestClassifier(n_estimators=100)
rfclassifier = rfclassifier.fit(X_train, y_train)

#Predict
y_prediction = rfclassifier.predict(X_test)
#print(prediction)

#Model evaluation
print("========================= Random Forest Evaluation ==============================")
print(classification_report(y_test,y_prediction))
print("Accuracy score : ".accuracy_score(y_test, y_prediction))

                            precision    recall  f1-score   support

      Architektur & Garten       0.00      0.00      0.00        29
Ganzheitliches Bewusstsein       0.79      0.14      0.23       139
            Glaube & Ethik       0.89      0.16      0.28        98
   Kinderbuch & Jugendbuch       0.90      0.25      0.39       377
                    Künste       0.00      0.00      0.00        31
  Literatur & Unterhaltung       0.65      0.99      0.78      1490
                  Ratgeber       0.67      0.62      0.64       344
                  Sachbuch       0.68      0.26      0.37       402

                  accuracy                           0.66      2910
                 macro avg       0.57      0.30      0.34      2910
              weighted avg       0.69      0.66      0.60      2910

0.6618556701030928


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
#================================ XGBoost  ======================================
#AdaBoostClassifier(base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
abclassifier = AdaBoostClassifier(n_estimators=100)
abmodel = abclassifier.fit(X_train, y_train)

#predict
y_prediction = abmodel.predict(X_test)

#Model evaluation
print("========================= AdaBoost  Evaluation ==============================")
print(classification_report(y_test,y_prediction))
print("Accuracy score : ".accuracy_score(y_test, y_prediction))

                            precision    recall  f1-score   support

      Architektur & Garten       0.70      0.24      0.36        29
Ganzheitliches Bewusstsein       0.50      0.01      0.01       139
            Glaube & Ethik       0.12      0.03      0.05        98
   Kinderbuch & Jugendbuch       0.20      0.03      0.05       377
                    Künste       0.33      0.16      0.22        31
  Literatur & Unterhaltung       0.59      0.94      0.72      1490
                  Ratgeber       0.40      0.14      0.21       344
                  Sachbuch       0.33      0.26      0.29       402

                  accuracy                           0.54      2910
                 macro avg       0.40      0.23      0.24      2910
              weighted avg       0.46      0.54      0.45      2910

0.540893470790378
