# Aufgabe


In dem Aufgabenordner finden Sie 2 Unterordner mit Dateien zu den Klassen A und B.  Wir wollen wissen, welche Wortart die besten Merkmale für die Klassifikation liefert.

Machen Sie dazu folgendes:

1. Teilen Sie die Daten in Test und Trainingsdaten ein.
2. Extrahieren Sie alle Substantive aus den Trainingsdaten
3. Nutzen Sie alle Substantive, die mindesten 5 Mal vorkommen als Merkmale und Trainieren Sie einen Klassifikator, der logistische Regression nutzt.
4. Evaluieren Sie den Klassifikator.
5. Wiederholen Sie den Vorgang für Adjekive, Verben und Adverbien.

Adjektive -> ADJA
Verben -> VB
Adverbien -> RB

# Lösung

In [30]:
import nltk
import re
from HanTa import HanoverTagger as ht
from collections import Counter

stopwords = nltk.corpus.stopwords.words('german')
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

def closed_class(pos):
    if pos[0] == '$':
        return True
    # elif pos in ["NE","ADJA","RB","APPR", "APPRART", "APPO", "APZR", "ART", "KOUI", "KOUS", "KON", "KOKOM", "PDS", "PDAT", "PIS", "PIAT", "PIDAT", "PPER", "PPOSS", "PPOSAT", "PRELS", "PRELAT", "PRF", "PWS", "PWAT", "PWAV", "PAV", "PTKZU", "PTKNEG", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP"]:
    #    return True
    elif pos not in ['ADJA','ADJD']:
        return True
    
    return False

def features_from_text(text):
    wordcounts = Counter()
    tlen = 0
    
    satzliste =  nltk.sent_tokenize(text,language='german')
    for satz in satzliste:
        tokens =  nltk.word_tokenize(satz,language='german')
        tokens = [lemma for (word,lemma,pos) in tagger.tag_sent(tokens) if not closed_class(pos)]
        tokens = [t for t in tokens if t.lower() not in stopwords]
        tokens = [t for t in tokens if re.search('^\w+$',t)]
        tlen += len(tokens)
        wordcounts.update(tokens)

    return {w:wordcounts[w]/tlen for w in wordcounts}

In [2]:
import codecs 

def readtext(dateiname):
    text = ''
    d = codecs.open(dateiname,'r','utf8')
    for zeile in d:
        text += str(zeile)
    d.close()

    return text

In [3]:
import glob

def read_data(directories):
    docs = []
    for directory in directories:
        dirsize = 0
        for file in glob.glob(directory+"/*.txt"):
            text = readtext(file)
            if len(text) > 500 and len(text) < 10000:
                docs.append((features_from_text(text),directory))
                dirsize += 1
            if dirsize >= 100:
                break
    return docs

data = read_data(['A','B'])

In [4]:
len(data)

200

In [5]:
import random

random.shuffle(data)
train_data = data[:160]
test_data = data[160:]

In [6]:
print(train_data[0][0])

{'beschrieben': 0.017857142857142856, 'groß': 0.05357142857142857, 'vermutet': 0.017857142857142856, 'paarig': 0.017857142857142856, 'zweiter': 0.017857142857142856, 'zurückgebildet': 0.017857142857142856, 'gesamt': 0.03571428571428571, 'vereint': 0.017857142857142856, 'dünn': 0.017857142857142856, 'ganz': 0.017857142857142856, 'entlegen': 0.03571428571428571, 'nival': 0.017857142857142856, 'niedrig': 0.017857142857142856, 'mittlerer': 0.017857142857142856, 'vorkommend': 0.03571428571428571, 'artenreich': 0.017857142857142856, 'endemisch': 0.017857142857142856, 'alt': 0.05357142857142857, 'neu': 0.017857142857142856, 'primitiv': 0.03571428571428571, 'trocken': 0.017857142857142856, 'faulend': 0.017857142857142856, 'tierisch': 0.017857142857142856, 'pflanzlich': 0.017857142857142856, 'oligophagen': 0.017857142857142856, 'ursprünglich': 0.03571428571428571, 'erdgeschichtlich': 0.017857142857142856, 'überwiegend': 0.03571428571428571, 'nestbildend': 0.017857142857142856, 'verschieden': 0.

In [7]:
docfreq = Counter()
for (wfreq,c) in train_data:
    docfreq.update(wfreq.keys())

In [8]:
docfreq.most_common(10)

[('chemisch', 56),
 ('groß', 44),
 ('klein', 38),
 ('erster', 34),
 ('hoch', 33),
 ('weiß', 32),
 ('zweiter', 31),
 ('weit', 31),
 ('verschieden', 30),
 ('folgend', 26)]

In [22]:
from sklearn import linear_model, datasets


allfeatures = [w for w in docfreq if docfreq[w] > 5]

def make_feat_vec(featmap,featlist):
    vec = []
    for f in featlist:
        vec.append(featmap.get(f,0.0))
    return vec

train_vec =  [make_feat_vec(feats,allfeatures) for feats,cls in train_data]
train_label = [cls for feats,cls in train_data]

In [10]:
train_vec[77]

[0.0,
 0.0625,
 0.0625,
 0.0,
 0.0,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625,
 0.0,
 0.0625,
 0.0,
 0.0625,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0625]

In [11]:
train_label[77]

'A'

In [23]:
logreg = linear_model.LogisticRegression(C=1e9,verbose=True)
logreg.fit(train_vec,train_label)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(C=1000000000.0, verbose=True)

In [24]:
v = make_feat_vec(test_data[5][0],allfeatures) 
logreg.predict([v])

array(['B'], dtype='<U1')

In [16]:
test_data[5][1]

'B'

In [25]:
test_vec = [make_feat_vec(feats,allfeatures) for feats,cls in test_data]
test_label = [cls for feats,cls in test_data]

pred_label = list(logreg.predict(test_vec))

In [26]:
correct = 0
for i in range(len(test_label)):
    if test_label[i] == pred_label[i]:
        correct+=1
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

92.5 Prozent korrekt


In [27]:
cm = nltk.ConfusionMatrix(test_label, pred_label)
print(cm)

  |  A  B |
--+-------+
A |<16> 3 |
B |  .<21>|
--+-------+
(row = reference; col = test)



In [28]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_vec,train_label)
pred_label = list(knn.predict(test_vec))
correct = 0
for i in range(len(test_label)):
    if test_label[i] == pred_label[i]:
        correct+=1
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

75.0 Prozent korrekt


In [29]:
from sklearn import ensemble
rf = ensemble.RandomForestClassifier()
rf.fit(train_vec,train_label)
pred_label = list(rf.predict(test_vec))
correct = 0
for i in range(len(test_label)):
    if test_label[i] == pred_label[i]:
        correct+=1
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

95.0 Prozent korrekt
