# Testdatensatz einlesen
Starten sie zunächst mit dem (kleineren) Testdatensatz. Das sollte die Berechnungen etwas schneller ablaufen lassen

In [100]:
import pandas as pd
import numpy as np
from normalize_corpus import normalize_corpus
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import model_evaluation_utils_hr as meu
import importlib
importlib.reload(meu)

<module 'model_evaluation_utils_hr' from '/home/yhutter/GitRepos/fhgr-ta/jupyter_notebooks/model_evaluation_utils_hr.py'>

In [32]:
names = []
sentences = []
labels = []
with open("./test.csv", "r", encoding="utf-8") as f:
    content = f.readlines()
    for index, line in enumerate(content):
        result = line.split(";", 1)
        name =  result[0]
        text = result[1]
        labels.append(index)
        names.append(name)
        sentences.append(text)
data_df = pd.DataFrame(list(zip(labels, names, sentences)), columns=["Target Label", "Target Name", "Article"])
data_df.head()

Unnamed: 0,Target Label,Target Name,Article
0,0,Wirtschaft,"'Die Gewerkschaft GPA-djp lanciert den ""All-in..."
1,1,Sport,Franzosen verteidigen 2:1-Führung – Kritische ...
2,2,Web,'Neues Video von Designern macht im Netz die R...
3,3,Sport,23-jähriger Brasilianer muss vier Spiele pausi...
4,4,International,Aufständische verwendeten Chemikalie bei Gefec...


In [33]:
total_nulls = data_df[data_df.Article.str.strip() == ''].shape[0]
print("Empty documents:", total_nulls)

Empty documents: 0


In [34]:
norm_corpus = normalize_corpus(data_df['Article'], contraction_expansion=False)
data_df['Clean Article'] = norm_corpus

In [37]:
data_df.head()

Unnamed: 0,Target Label,Target Name,Article,Clean Article
0,0,Wirtschaft,"'Die Gewerkschaft GPA-djp lanciert den ""All-in...",Gewerkschaft gpadjp lancieren allinrechner fin...
1,1,Sport,Franzosen verteidigen 2:1-Führung – Kritische ...,Franzosen verteidigen fuhrung kritisch Stimme ...
2,2,Web,'Neues Video von Designern macht im Netz die R...,neu Video Designer Netz Runde schlagen etwa bu...
3,3,Sport,23-jähriger Brasilianer muss vier Spiele pausi...,jahriger Brasilianer vier Spiele pausieren Ent...
4,4,International,Aufständische verwendeten Chemikalie bei Gefec...,Aufstandische verwenden Chemikalie Gefecht Aug...


In [38]:
# Save the cleaned result so that we do not have to run it again
data_df.to_csv('clean_test.csv', index=False)

In [39]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Target Label   1028 non-null   int64 
 1   Target Name    1028 non-null   object
 2   Article        1028 non-null   object
 3   Clean Article  1028 non-null   object
dtypes: int64(1), object(3)
memory usage: 32.3+ KB


# Try out different Algorithms

In [74]:
train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

((688,), (340,))

## Bag of Words Methods

In [75]:
# Build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)
cv_test_features = cv.transform(test_corpus)
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (688, 31590)  Test features shape: (340, 31590)


In [76]:
mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score) #auf den Testwerten

CV Accuracy (5-fold): [0.73913043 0.64492754 0.65217391 0.72992701 0.68613139]
Mean CV Accuracy: 0.6904580556437109
Test Accuracy: 0.6823529411764706


In [77]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.65942029 0.69565217 0.66666667 0.67883212 0.67153285]
Mean CV Accuracy: 0.6744208187876864
Test Accuracy: 0.7058823529411765


In [78]:
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.65942029 0.68115942 0.64492754 0.67883212 0.67883212]
Mean CV Accuracy: 0.6686342959906908
Test Accuracy: 0.6676470588235294




In [79]:
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.69565217 0.57971014 0.67391304 0.65693431 0.64233577]
Mean CV Accuracy: 0.6497090870623082
Test Accuracy: 0.6941176470588235




In [80]:
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.48550725 0.45652174 0.44202899 0.52554745 0.48175182]
Mean CV Accuracy: 0.47827144821749706
Test Accuracy: 0.48823529411764705


In [81]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

CV Accuracy (5-fold): [0.66666667 0.55797101 0.55797101 0.56934307 0.54744526]
Mean CV Accuracy: 0.5798794033640114
Test Accuracy: 0.5823529411764706


## TF-IDF Methods

In [82]:
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

tv_test_features = tv.transform(test_corpus)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (688, 31590)  Test features shape: (340, 31590)


In [83]:
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.51449275 0.45652174 0.47101449 0.45985401 0.51824818]
Mean CV Accuracy: 0.4840262350576536
Test Accuracy: 0.5058823529411764


In [84]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.63768116 0.5942029  0.5942029  0.62043796 0.57664234]
Mean CV Accuracy: 0.6046334496985084
Test Accuracy: 0.65


In [85]:
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.7826087  0.76086957 0.73188406 0.75912409 0.70072993]
Mean CV Accuracy: 0.7470432666878241
Test Accuracy: 0.8058823529411765




In [86]:
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.75362319 0.72463768 0.74637681 0.7080292  0.72262774]
Mean CV Accuracy: 0.7310589230931979
Test Accuracy: 0.7852941176470588




In [87]:
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.51449275 0.45652174 0.51449275 0.51824818 0.42335766]
Mean CV Accuracy: 0.485422617158574
Test Accuracy: 0.45


In [88]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.60869565 0.56521739 0.54347826 0.59854015 0.60583942]
Mean CV Accuracy: 0.5843541732783244
Test Accuracy: 0.5617647058823529


## Ergebnisse

In [89]:
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, 
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.690458,0.674421,0.668634,0.649709,0.478271,0.579879
Test Score (TF),0.682353,0.705882,0.667647,0.694118,0.488235,0.582353
CV Score (TF-IDF),0.484026,0.604633,0.747043,0.731059,0.485423,0.584354
Test Score (TF-IDF),0.505882,0.65,0.805882,0.785294,0.45,0.561765


# Modell trainieren ohne Hyperparameter Tuning

In [90]:
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)

svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

Test Accuracy: 0.8058823529411765




# Modell trainieren mit Hyperparameter Tuning

In [96]:
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=42))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [0.01, 0.1, 1, 5]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 2); total time=   0.2s
[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 2); total time=   0.2s
[CV] END ................svm__C=1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=1, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=1, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=1, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 2); total time=   0.2s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ................svm__C=1, tfidf__ngram_range=(1, 2); total time=   0.2s
[CV] END ................svm__C=5, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=5, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ................svm__C=5, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=5, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END ................svm__C=5, tfidf__ngram_range=(1, 1); total time=   0.1s




[CV] END ................svm__C=5, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ................svm__C=5, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ................svm__C=5, tfidf__ngram_range=(1, 2); total time=   0.3s




[CV] END ................svm__C=5, tfidf__ngram_range=(1, 2); total time=   0.3s
[CV] END ................svm__C=5, tfidf__ngram_range=(1, 2); total time=   0.3s




In [98]:
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()),
  ('svm', LinearSVC(C=5, random_state=42))],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'svm': LinearSVC(C=5, random_state=42),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'svm__C': 5,
 'svm__class_weight': None,
 'svm__dual': 'warn',
 'svm__fit_intercept': True,
 'svm__intercept_scaling': 1,
 'svm__loss': 'squared_hinge',
 'svm__max_iter': 1000,
 'svm__multi_class': 'ovr'

In [99]:
cv_results = gs_svm.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                           'params': cv_results['params'], 
                           'cv score (mean)': cv_results['mean_test_score'], 
                           'cv score (std)': cv_results['std_test_score']} 
              )
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)
results_df

Unnamed: 0,rank,params,cv score (mean),cv score (std)
6,1,"{'svm__C': 5, 'tfidf__ngram_range': (1, 1)}",0.760129,0.022071
4,2,"{'svm__C': 1, 'tfidf__ngram_range': (1, 1)}",0.754311,0.027434
7,3,"{'svm__C': 5, 'tfidf__ngram_range': (1, 2)}",0.751433,0.015186
5,4,"{'svm__C': 1, 'tfidf__ngram_range': (1, 2)}",0.742706,0.01799
2,5,"{'svm__C': 0.1, 'tfidf__ngram_range': (1, 1)}",0.670052,0.026234
3,6,"{'svm__C': 0.1, 'tfidf__ngram_range': (1, 2)}",0.635206,0.025666
0,7,"{'svm__C': 0.01, 'tfidf__ngram_range': (1, 1)}",0.286332,0.022186
1,8,"{'svm__C': 0.01, 'tfidf__ngram_range': (1, 2)}",0.20786,0.010037


In [97]:
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)

Test Accuracy : 0.8088235294117647


In [101]:
svm_predictions = gs_svm.predict(test_corpus)
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

Accuracy: 0.8088
Precision: 0.8181
Recall: 0.8088
F1 Score: 0.8107


In [102]:
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=svm_predictions, classes=unique_classes)

               precision    recall  f1-score   support

        Sport       1.00      0.93      0.97        45
       Kultur       0.84      0.67      0.74        24
          Web       0.90      0.86      0.88        64
     Panorama       0.71      0.78      0.74        46
       Inland       0.74      0.81      0.77        31
   Wirtschaft       0.68      0.82      0.74        44
         Etat       0.85      0.71      0.77        24
International       0.78      0.76      0.77        50
 Wissenschaft       0.91      0.83      0.87        12

     accuracy                           0.81       340
    macro avg       0.82      0.80      0.81       340
 weighted avg       0.82      0.81      0.81       340



In [104]:
meu.display_confusion_matrix_pretty(true_labels=test_label_names, 
                                    predicted_labels=svm_predictions, classes=unique_classes)

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,Sport,Kultur,Web,Panorama,Inland,Wirtschaft,Etat,International,Wissenschaft
Actual:,Sport,17,2,2,2,1,0,0,0,0
Actual:,Kultur,0,25,1,0,2,0,0,3,0
Actual:,Web,1,1,38,0,5,0,1,4,0
Actual:,Panorama,1,1,0,16,3,0,1,2,0
Actual:,Inland,0,2,2,1,36,0,1,4,0
Actual:,Wirtschaft,0,0,2,0,0,42,0,1,0
Actual:,Etat,0,1,2,0,2,0,55,3,1
Actual:,International,1,2,1,0,2,0,2,36,0
Actual:,Wissenschaft,0,0,1,0,0,0,1,0,10
