# Aufgabe

Zu dieser Aufgabe finden Sie 2 Dateien. Die Dateien enthalten englische Reviews von Filmen. Die eine Datei enthält negative, die andere positive Reviews.

Wir wollen einen Klassifikator bauen, der vorhersagt, ob ein Review positiv oder negativ ist. Machen Sie dazu folgendes:

1. Teilen Sie die Daten in Test- und Trainingsdaten ein.
2. Extrahieren Sie alle Adjektive, Verben und Adverbien aus den Reviews
3. Nutzen Sie alle Adjektive, Verben und Adverbien, die mindestens 5 Mal vorkommen, als Merkmale und trainieren Sie einen Klassifikator, der logistische Regression nutzt.
4. Evaluieren Sie den Klassifikator.

# Lösung

In [1]:
import codecs 

def readtext(dateiname):
    """Return the complete content of a UTF-8 encoded text file as one string.

    Parameters
    ----------
    dateiname : str
        Path of the file to read.

    Returns
    -------
    str
        The decoded file content; line endings are returned as stored,
        because ``codecs.open`` performs no newline translation.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when decoding fails part-way,
    # and a single read() avoids rebuilding the result string once per line.
    with codecs.open(dateiname, 'r', 'utf8') as d:
        return d.read()

In [2]:
import glob

def read_data(directories):
    """Collect every ``*.txt`` file in the given directories as labelled documents.

    Parameters
    ----------
    directories : iterable of str
        Directory paths to scan (not recursively). Each path string is also
        used verbatim as the class label of the documents found in it.

    Returns
    -------
    list of (str, str)
        One ``(text, directory)`` tuple per file, ordered by directory as
        given and by file path within each directory.
    """
    docs = []
    for directory in directories:
        # glob returns matches in arbitrary, filesystem-dependent order;
        # sorting makes the document order repeatable across runs and machines.
        for path in sorted(glob.glob(directory + "/*.txt")):
            docs.append((readtext(path), directory))
    return docs

# read_data labels each document with the directory it came from, so the
# directory names double as the class labels.
review_dirs = ['neg', 'pos']
data = read_data(review_dirs)

Teilen Sie die Daten in Test- und Trainingsdaten ein.

In [3]:
import nltk
import re
from collections import Counter

# English stop words from NLTK. Stored as a frozenset because the collection is
# only used for membership tests, once per token; hashing avoids a linear scan
# of the word list each time.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))

def features_from_text(text, POSlist):
    """Turn a text into relative frequencies of its words with selected POS tags.

    The text is split into sentences; each sentence is tokenised and POS-tagged
    with NLTK. A token is counted only if

    * its tag is contained in ``POSlist``,
    * its lower-cased form is not in the module-level ``stopwords`` collection,
    * it consists solely of word characters (letters, digits, underscore).

    Tokens are counted in their original spelling; no case folding is applied
    to the resulting keys.

    Parameters
    ----------
    text : str
        The document to analyse.
    POSlist : collection of str
        POS tags to keep, compared against the tags returned by ``nltk.pos_tag``.

    Returns
    -------
    dict
        Maps each kept token to ``count / number of kept tokens``. The result is
        an empty dict when no token survives the filters.
    """
    # Raw string on purpose: '\w' inside a normal string literal is an invalid
    # escape sequence, which newer Python versions warn about at compile time.
    word_only = re.compile(r'^\w+$')

    wordcounts = Counter()
    for sent in nltk.sent_tokenize(text, language='english'):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent, language='english'))
        wordcounts.update(
            word
            for word, pos in tagged
            if pos in POSlist
            and word.lower() not in stopwords
            and word_only.search(word)
        )

    # With no kept tokens the comprehension body never runs, so the division
    # cannot hit a zero denominator.
    total = sum(wordcounts.values())
    return {w: n / total for w, n in wordcounts.items()}

In [4]:
import random

# A fixed seed makes the split repeatable for a given order of `data`; an
# unseeded shuffle yields a different split on every run.
SPLIT_SEED = 42
# Number of documents held out for testing.
N_TEST_DOCS = 1000

# Shuffle a copy so that re-running this cell reproduces the same split instead
# of reshuffling `data` in place a second time.
shuffled_data = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

test_data_raw = shuffled_data[:N_TEST_DOCS]
train_data_raw = shuffled_data[N_TEST_DOCS:]

In [5]:
# Penn Treebank tag groups. Assign a single group to `poslist` to experiment
# with one word class only.
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
adverb_tags = ['RB', 'RBR', 'RBS']
adjective_tags = ['JJ', 'JJR', 'JJS']
poslist = verb_tags + adverb_tags + adjective_tags

# Replace each raw text by its feature dict; the class label is carried along.
test_data = [
    (features_from_text(doc_text, poslist), doc_label)
    for doc_text, doc_label in test_data_raw
]
train_data = [
    (features_from_text(doc_text, poslist), doc_label)
    for doc_text, doc_label in train_data_raw
]

In [6]:
# Spot check: show the (feature dict, label) pair of one training document.
train_data[13]

({'avowed': 0.01694915254237288,
  'ellie': 0.01694915254237288,
  'jodie': 0.01694915254237288,
  'partaking': 0.01694915254237288,
  'different': 0.01694915254237288,
  'say': 0.05084745762711865,
  'exactly': 0.01694915254237288,
  'needs': 0.01694915254237288,
  'solid': 0.01694915254237288,
  'existed': 0.01694915254237288,
  'therefor': 0.01694915254237288,
  'believe': 0.03389830508474576,
  'also': 0.01694915254237288,
  'believes': 0.03389830508474576,
  'spent': 0.01694915254237288,
  'trying': 0.01694915254237288,
  'prove': 0.01694915254237288,
  'exist': 0.01694915254237288,
  'gereally': 0.01694915254237288,
  'less': 0.01694915254237288,
  'believed': 0.01694915254237288,
  'sounding': 0.01694915254237288,
  'fax': 0.01694915254237288,
  'transport': 0.01694915254237288,
  'well': 0.03389830508474576,
  'really': 0.01694915254237288,
  'wants': 0.01694915254237288,
  'go': 0.03389830508474576,
  'denied': 0.01694915254237288,
  'dumb': 0.01694915254237288,
  'discovers':

In [7]:
# Document frequency: each feature dict contributes its keys once, so the count
# is the number of training documents that contain the word.
docfreq = Counter(word for wfreq, _ in train_data for word in wfreq)

In [8]:
# The ten words that occur in the largest number of training documents.
docfreq.most_common(10)

[('even', 629),
 ('good', 585),
 ('much', 556),
 ('get', 539),
 ('also', 513),
 ('make', 509),
 ('first', 503),
 ('see', 484),
 ('well', 482),
 ('little', 455)]

In [9]:
from sklearn import linear_model, datasets

# Feature vocabulary: words found in at least MIN_DOC_FREQ training documents.
# The list order fixes the column order of every feature vector.
MIN_DOC_FREQ = 5
allfeatures = [word for word, n_docs in docfreq.items() if n_docs >= MIN_DOC_FREQ]

def make_feat_vec(featmap, featlist):
    """Lay out a sparse feature mapping as a dense list in a fixed order.

    Parameters
    ----------
    featmap : dict
        Maps feature names to values.
    featlist : iterable
        Feature names in the desired output order.

    Returns
    -------
    list
        One entry per name in ``featlist``: the value from ``featmap``, or
        ``0.0`` where the name is missing.
    """
    return [featmap.get(name, 0.0) for name in featlist]

# Dense feature rows and their labels, in the same order as train_data.
train_vec = []
train_label = []
for doc_feats, doc_label in train_data:
    train_vec.append(make_feat_vec(doc_feats, allfeatures))
    train_label.append(doc_label)

In [10]:
# Spot check: the dense feature vector of one training document
# (one entry per word in allfeatures, so the output is long).
train_vec[86]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.02531645569620253,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.02531645569620253,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0379746835443038,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.012658227848101266,
 0.0,
 0.0,
 0.0,
 0.012658227

In [11]:
# Label belonging to the feature vector at the same index in train_vec.
train_label[86]

'pos'

In [12]:
# C is scikit-learn's inverse regularisation strength, so a value this large
# leaves the penalty with practically no influence on the fit. max_iter is
# raised to 2000 to give the solver more iterations to converge.
logreg_params = {'C': 1e9, 'verbose': True, 'max_iter': 2000}
logreg = linear_model.LogisticRegression(**logreg_params)
logreg.fit(train_vec, train_label)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished


LogisticRegression(C=1000000000.0, max_iter=2000, verbose=True)

In [13]:
# Vectorise the held-out documents with the vocabulary built from the training
# data, then let the fitted model label them.
test_label = [gold for _, gold in test_data]
test_vec = [make_feat_vec(doc_feats, allfeatures) for doc_feats, _ in test_data]

pred_label = list(logreg.predict(test_vec))

In [14]:
# Accuracy: share of test documents whose predicted label equals the gold label.
correct = sum(1 for gold, pred in zip(test_label, pred_label) if gold == pred)
accuracy_percent = 100.0 * correct / len(test_label)
print("{0:.1f} Prozent korrekt".format(accuracy_percent))

80.5 Prozent korrekt


In [17]:
# Classify one test document on its own; predict() takes a sequence of
# feature vectors, hence the one-element list.
sample_feats = test_data[2][0]
v = make_feat_vec(sample_feats, allfeatures)
logreg.predict([v])

array(['neg'], dtype='<U3')

In [16]:
# Gold label of the test document at index 2, for comparison with a prediction.
test_data[2][1]

'neg'