# Aufgabe


In dem Aufgabenordner finden Sie 2 Unterordner mit Dateien zu den Klassen A und B. Wir wollen wissen, welche Wortart die besten Merkmale für die Klassifikation liefert.

Machen Sie dazu Folgendes:

1. Teilen Sie die Daten in Test- und Trainingsdaten ein.
2. Extrahieren Sie alle Substantive aus den Trainingsdaten.
3. Nutzen Sie alle Substantive, die mindestens 5 Mal vorkommen, als Merkmale und trainieren Sie einen Klassifikator, der logistische Regression nutzt.
4. Evaluieren Sie den Klassifikator.
5. Wiederholen Sie den Vorgang für Adjektive, Verben und Adverbien.

Adjektive -> ADJA
Verben -> VB
Adverbien -> RB

# Lösung

In [1]:
import codecs 

def readtext(dateiname):
    """Read a UTF-8 encoded text file and return its complete content.

    Args:
        dateiname: Path of the file to read.

    Returns:
        The decoded content of the file as one string (line endings are
        kept exactly as stored in the file).
    """
    # The context manager closes the handle even when reading or decoding
    # raises, which the explicit close() call could not guarantee.
    with codecs.open(dateiname, 'r', 'utf8') as d:
        return d.read()

In [50]:
import glob

def read_data(directories):
    """Build labelled feature maps from the ``*.txt`` files of each directory.

    Only texts longer than 500 and shorter than 10000 characters are used,
    and at most 100 texts are taken from each directory.

    Args:
        directories: Iterable of directory paths; each path also serves as
            the class label of the documents found in it.

    Returns:
        A list of ``(feature_map, directory)`` tuples, where ``feature_map``
        is the result of ``features_from_text`` for one document.
    """
    labelled = []
    for directory in directories:
        accepted = 0
        for path in glob.glob(directory + "/*.txt"):
            content = readtext(path)
            if 500 < len(content) < 10000:
                labelled.append((features_from_text(content), directory))
                accepted += 1
                # The counter only changes here, so this is the only place
                # where the per-directory cap can be reached.
                if accepted >= 100:
                    break
    return labelled

# Load both classes; the directory names 'A' and 'B' double as class labels.
data = read_data(['A','B'])

In [51]:
# Sanity check: show the first 300 characters of one class-A document.
bsptext= readtext('A/Agathiphaga.txt')
print(bsptext[:300])

Kauri-Motten (Agathiphaga) sind eine Gattung der Schmetterlinge, sie sind die einzigen Vertreter der ursprünglichen Familie Agathiphagidae. Wegen ihrer isolierten Stellung werden sie sogar in eine eigene Überfamilie Agathiphagoidea, und in eine eigene Unterordnung Aglossata, gestellt. Nächst zu den 


In [52]:
import nltk
import re
from HanTa import HanoverTagger as ht
from collections import Counter

# German stop word list from NLTK; used to filter lemmas in features_from_text.
stopwords = nltk.corpus.stopwords.words('german')
# HanTa tagger loaded from the German morphology model file; tag_sent() is
# used below to obtain (word, lemma, pos) triples for a tokenised sentence.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# Tags of closed word classes (prepositions, articles, conjunctions,
# pronouns, particles, auxiliary and modal verbs). Built once instead of on
# every call.
_CLOSED_CLASS_TAGS = frozenset([
    "APPR", "APPRART", "APPO", "APZR", "ART", "KOUI", "KOUS", "KON", "KOKOM",
    "PDS", "PDAT", "PIS", "PIAT", "PIDAT", "PPER", "PPOSS", "PPOSAT", "PRELS",
    "PRELAT", "PRF", "PWS", "PWAT", "PWAV", "PAV", "PTKZU", "PTKNEG", "VAFIN",
    "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP",
])


def closed_class(pos, open_tags=('NE',)):
    """Tell whether a token with tag ``pos`` must be dropped as a feature.

    Despite its name the function is a whitelist filter: a tag is rejected
    when it is a punctuation tag (starts with ``$``), a closed-class tag, or
    simply not one of ``open_tags``.

    Args:
        pos: Part-of-speech tag of the token.
        open_tags: Collection of tags that are kept. The default keeps only
            ``'NE'``, which is what the function did before this parameter
            existed; pass other tags to repeat the experiment for another
            word class.

    Returns:
        ``True`` if the token should be discarded, ``False`` if it is kept.
    """
    # startswith() instead of pos[0]: an empty tag is rejected rather than
    # raising IndexError.
    if pos.startswith('$'):
        return True
    if pos in _CLOSED_CLASS_TAGS:
        return True
    return pos not in open_tags

def features_from_text(text):
    """Compute relative lemma frequencies for one document.

    The text is split into sentences and tokens with NLTK (German models)
    and tagged with the module-level ``tagger``. A lemma is counted only if
    its tag is not rejected by ``closed_class``, its lower-cased form is not
    in ``stopwords`` and it consists of word characters only.

    Args:
        text: The raw document text.

    Returns:
        A dict mapping each kept lemma to its count divided by the total
        number of kept tokens in the document. The dict is empty when no
        token survives the filters.
    """
    wordcounts = Counter()

    for satz in nltk.sent_tokenize(text, language='german'):
        tokens = nltk.word_tokenize(satz, language='german')
        for _word, lemma, pos in tagger.tag_sent(tokens):
            if closed_class(pos):
                continue
            if lemma.lower() in stopwords:
                continue
            # Raw string: '\w' in a plain literal is an invalid escape
            # sequence and triggers a warning on current Python versions.
            if not re.search(r'^\w+$', lemma):
                continue
            wordcounts[lemma] += 1

    # Total number of kept tokens; the comprehension below never divides
    # when this is zero because wordcounts is then empty.
    tlen = sum(wordcounts.values())
    return {w: count / tlen for w, count in wordcounts.items()}

In [55]:
len(data)

200

In [56]:
import random

# read_data returns the documents grouped by directory, so shuffle first to
# mix both classes before cutting the list.
random.shuffle(data)
# Hold out the last fifth for testing. Deriving the cut from len(data) keeps
# the 80/20 ratio when the corpus does not contain exactly 200 documents;
# with 200 documents this is the 160/40 split used before.
split_at = len(data) * 4 // 5
train_data = data[:split_at]
test_data = data[split_at:]

In [57]:
print(train_data[0][0])

{'Urania': 0.15942028985507245, 'Nachtfalter': 0.014492753623188406, 'Uraniidae': 0.014492753623188406, 'Jamaika': 0.028985507246376812, 'Heller': 0.014492753623188406, 'Hinterleibes': 0.014492753623188406, 'Philip': 0.043478260869565216, 'Henry': 0.043478260869565216, 'Blue': 0.014492753623188406, 'Mountain': 0.014492753623188406, 'Primärwald': 0.014492753623188406, 'Raupen': 0.028985507246376812, 'Omphalea': 0.028985507246376812, 'Triandra': 0.014492753623188406, 'Portland': 0.014492753623188406, 'Parish': 0.014492753623188406, 'Hans': 0.014492753623188406, 'Sloane': 0.014492753623188406, 'Carl': 0.014492753623188406, 'Linné': 0.014492753623188406, 'Pieter': 0.014492753623188406, 'Cramer': 0.014492753623188406, 'Jacob': 0.014492753623188406, 'Hübner': 0.014492753623188406, 'S': 0.043478260869565216, 'Online': 0.043478260869565216, 'Ii': 0.014492753623188406, 'Pupa': 0.014492753623188406, 'David': 0.014492753623188406, 'Lee': 0.014492753623188406, 'Neal': 0.014492753623188406, 'Smith'

In [58]:
# Document frequency: in how many training documents each feature occurs.
# Iterating a feature map yields its keys, so every word is counted once per
# document regardless of its relative frequency.
docfreq = Counter(
    word
    for feature_map, _label in train_data
    for word in feature_map
)

In [59]:
docfreq.most_common(10)

[('Raupen', 59),
 ('O', 55),
 ('Isbn', 49),
 ('Lepidoptera', 38),
 ('Europa', 36),
 ('De', 28),
 ('Ii', 26),
 ('Gruyter', 25),
 ('Berlin', 23),
 ('Ca', 23)]

In [60]:
from sklearn import linear_model, datasets


# Feature vocabulary: words that occur in at least 5 training documents.
allfeatures = [word for word, n_docs in docfreq.items() if n_docs > 4]

def make_feat_vec(featmap,featlist):
    """Turn a feature map into a dense vector ordered like ``featlist``.

    Args:
        featmap: Mapping from feature name to value.
        featlist: Sequence of feature names defining the vector positions.

    Returns:
        A list with one entry per name in ``featlist``; features missing
        from ``featmap`` are represented by ``0.0``.
    """
    return [featmap.get(name, 0.0) for name in featlist]

# Dense training matrix (one row per document) and the matching class labels.
train_vec = [
    make_feat_vec(feature_map, allfeatures)
    for feature_map, _label in train_data
]
train_label = [label for _feature_map, label in train_data]

In [61]:
train_vec[77]

[0.0,
 0.0,
 0.08823529411764706,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.029411764705882353,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.029411764705882353,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.029411764705882353,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.029411764705882353,
 0.0,
 0.0]

In [62]:
train_label[77]

'A'

In [63]:
# Logistic regression on the training vectors. In scikit-learn C is the
# inverse regularisation strength, so C=1e9 makes the penalty negligible.
logreg = linear_model.LogisticRegression(C=1e9,verbose=True)
logreg.fit(train_vec,train_label)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


LogisticRegression(C=1000000000.0, verbose=True)

In [64]:
# Spot check: classify a single test document. predict() expects a list of
# feature vectors, hence the wrapping list.
v = make_feat_vec(test_data[5][0],allfeatures) 
logreg.predict([v])

array(['A'], dtype='<U1')

In [65]:
len(test_data)

40

In [66]:
# Vectorise the held-out documents with the vocabulary chosen on the training
# data, then let the logistic regression model label them.
test_vec = [
    make_feat_vec(feature_map, allfeatures)
    for feature_map, _label in test_data
]
test_label = [label for _feature_map, label in test_data]

pred_label = list(logreg.predict(test_vec))

In [67]:
# Accuracy: share of test documents whose predicted label equals the gold label.
correct = sum(
    1 for gold, predicted in zip(test_label, pred_label) if gold == predicted
)
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

100.0 Prozent korrekt


In [68]:
# Confusion matrix of gold labels (rows) against predicted labels (columns).
cm = nltk.ConfusionMatrix(test_label, pred_label)
print(cm)

  |  A  B |
--+-------+
A |<21> . |
B |  .<19>|
--+-------+
(row = reference; col = test)



In [69]:
from sklearn import neighbors
# Baseline for comparison: 3-nearest-neighbours on the same feature vectors.
knn = neighbors.KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_vec,train_label)
pred_label = list(knn.predict(test_vec))
# Accuracy on the held-out documents.
correct = sum(
    1 for gold, predicted in zip(test_label, pred_label) if gold == predicted
)
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

95.0 Prozent korrekt


In [70]:
from sklearn import ensemble
# Second baseline: random forest with default settings on the same features.
rf = ensemble.RandomForestClassifier()
rf.fit(train_vec,train_label)
pred_label = list(rf.predict(test_vec))
# Accuracy on the held-out documents.
correct = sum(
    1 for gold, predicted in zip(test_label, pred_label) if gold == predicted
)
print("{0:.1f} Prozent korrekt".format(100* float(correct)/len(test_label)))

100.0 Prozent korrekt
