# Document classifier

## Daten
- Wir brauchen zuerst daten um unser Modell zu trainieren

In [21]:
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg')
    #('FCB','neg'),
    #('Basel','neg')
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

## Training

In [22]:
cl = NaiveBayesClassifier(train)

## Test
- Wie gut performed unser Modell bei Daten die es noch nie gesehen hat?

In [23]:
cl.accuracy(test)

0.8333333333333334

- Zu 80% korrekt, ok für mich :)

## Features
- Welche wörter sorgen am meisten dafür dass etwas positiv oder negativ klassifiziert wird?

In [24]:
cl.show_informative_features(5)

Most Informative Features
          contains(this) = True              neg : pos    =      2.3 : 1.0
          contains(this) = False             pos : neg    =      1.8 : 1.0
          contains(This) = False             neg : pos    =      1.6 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0


Er ist der meinung wenn "this" vorkommt ist es eher positiv, was natürlich quatsch ist, aber das hat er nun mal so gelernt, deswegen braucht ihr gute trainingsdaten. 

## Klassifizierung

In [25]:
cl.classify("Their burgers are amazing")  # "pos"

'pos'

In [32]:
cl.classify("I don't like their pizza.")  # "neg"
cl.classify("this fcb")

'neg'

### Klassizierung nach Sätzen

In [34]:
from textblob import TextBlob
blob = TextBlob("The beer was amazing. "
                "But the hangover was horrible. My boss was not happy.",
                classifier=cl)


In [36]:
for sentence in blob.sentences:
    print(("%s (%s)") % (sentence,sentence.classify()))

The beer was amazing. (pos)
But the hangover was horrible. (neg)
My boss was not happy. (neg)


## Mit schweizer Songtexten Kommentare klassifizieren

In [38]:
import os
import glob
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize


from io import open


train = []
countries = ["schweiz", "deutschland"]
for country in countries:
    out = []
    folder_path = 'songtexte/%s' % country 
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
      with open(filename, 'r') as f:
        text = f.read()
        words = word_tokenize(text)
        words=[word.lower() for word in words if word.isalpha()]
        for word in words:
            out.append(word)
    out = set(out)
    for word in out:
        train.append((word,country))
        #print (filename)
        #print (len(text))
train

[('cho', 'schweiz'),
 ('gö', 'schweiz'),
 ('de', 'schweiz'),
 ('sing', 'schweiz'),
 ('eim', 'schweiz'),
 ('es', 'schweiz'),
 ('strecke', 'schweiz'),
 ('derzue', 'schweiz'),
 ('schell', 'schweiz'),
 ('uftue', 'schweiz'),
 ('ine', 'schweiz'),
 ('du', 'schweiz'),
 ('gange', 'schweiz'),
 ('zue', 'schweiz'),
 ('alles', 'schweiz'),
 ('gumpe', 'schweiz'),
 ('ume', 'schweiz'),
 ('schwänzli', 'schweiz'),
 ('cha', 'schweiz'),
 ('und', 'schweiz'),
 ('müed', 'schweiz'),
 ('ding', 'schweiz'),
 ('schnäderet', 'schweiz'),
 ('hei', 'schweiz'),
 ('chöme', 'schweiz'),
 ('ring', 'schweiz'),
 ('us', 'schweiz'),
 ('d', 'schweiz'),
 ('tanz', 'schweiz'),
 ('eis', 'schweiz'),
 ('arme', 'schweiz'),
 ('si', 'schweiz'),
 ('äntli', 'schweiz'),
 ('affe', 'schweiz'),
 ('dr', 'schweiz'),
 ('uf', 'schweiz'),
 ('oder', 'schweiz'),
 ('bruchsch', 'schweiz'),
 ('wil', 'schweiz'),
 ('winke', 'schweiz'),
 ('wisli', 'schweiz'),
 ('drum', 'schweiz'),
 ('s', 'schweiz'),
 ('ou', 'schweiz'),
 ('wasser', 'schweiz'),
 ('ärde', 's

In [39]:
from textblob.classifiers import NaiveBayesClassifier
c2 = NaiveBayesClassifier(train)

In [40]:
c2.classify("Ich gehe durch den Wald")  # "deutsch"

'deutschland'

In [50]:
c2.classify("es du bei und zur")  # "deutsch"


'schweiz'

In [49]:
c2.show_informative_features(5)

Most Informative Features
            contains(es) = True           schwei : deutsc =      1.3 : 1.0
            contains(du) = True           schwei : deutsc =      1.3 : 1.0
           contains(bei) = True           schwei : deutsc =      1.3 : 1.0
           contains(und) = True           schwei : deutsc =      1.3 : 1.0
           contains(zur) = True           schwei : deutsc =      1.3 : 1.0


## Hardcore Beispiel mit Film-review daten mit NLTK
- https://www.nltk.org/book/ch06.html
- Wir nutzen nur noch die 100 häufigsten Wörter in den Texten und schauen ob sie bei positiv oder negativ vorkommen

In [64]:
import random
import nltk

In [56]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [62]:
(" ").join(documents[1][1])
# 1. Position: 0,1,2,3, etc. movie review iterator
# 2. Position: 0 review / 1 bewertung (pos or neg)

'p o s'

In [70]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if w.isalpha())
word_features = list(all_words)[:2000]
word_features[0:20]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of']

In [71]:
def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [72]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(plot)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, 'contains(s)': True, 'contains(deal)': False, 'contains(watch)': True, 'contains(movie)': True, 'contains(sorta)': False, 'contains(find)':

In [73]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [74]:
classifier.classify(document_features("a movie with bad actors".split(" ")))

'neg'

In [75]:
classifier.classify(document_features("an uplifting movie with russel crowe".split(" ")))

'neg'

In [76]:
classifier.show_most_informative_features(10)

Most Informative Features
    contains(schumacher) = True              neg : pos    =      7.3 : 1.0
        contains(suvari) = True              neg : pos    =      6.9 : 1.0
        contains(shoddy) = True              neg : pos    =      6.9 : 1.0
          contains(mena) = True              neg : pos    =      6.9 : 1.0
        contains(turkey) = True              neg : pos    =      6.4 : 1.0
       contains(jumbled) = True              neg : pos    =      6.2 : 1.0
     contains(atrocious) = True              neg : pos    =      6.1 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.8 : 1.0
       contains(unravel) = True              pos : neg    =      5.8 : 1.0
           contains(ugh) = True              neg : pos    =      5.7 : 1.0
