In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk

In [7]:
tweets = pd.read_csv("../Datasets/tweets.csv", sep=",")

In [8]:
tweets.head()

Unnamed: 0,tweets,label
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,@DespiteOfficial we had a listen last night :)...,1
3,@97sides CONGRATS :),1
4,yeaaaah yippppy!!! my accnt verified rqst has...,1


In [40]:
len(tweets)

10000

In [12]:
testsätze = [
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining and the weather is sweet"
]

In [63]:
count = CountVectorizer()
docs = np.array(testsätze)
bag = count.fit_transform(docs)

### Vocabulary

In [64]:
print(count.vocabulary_) ## Eigentlich müssten gewisse Preprocessing-Schritte vorgelagert sein --> Stop Words entfernen, UserNames etc.

{'today': 7, 'it': 2, 'is': 1, 'raining': 5, 'outside': 4, 'the': 6, 'weather': 9, 'not': 3, 'very': 8, 'good': 0}


### Merkmalsvektoren

One-Hot Codierte Arrays, welche anzeigen wie häufig bestimmte Wörter in einem Satz/Post/Dokument vorkommen

#### Monogramme

In [65]:
print(count.vocabulary_)

{'today': 7, 'it': 2, 'is': 1, 'raining': 5, 'outside': 4, 'the': 6, 'weather': 9, 'not': 3, 'very': 8, 'good': 0}


In [66]:
print(bag.toarray()) ##Merkmalsvektoren für jeden Satz

[[0 1 1 0 0 1 0 1 0 0]
 [0 1 1 0 1 1 0 0 0 0]
 [1 1 0 1 0 0 1 1 1 1]]


##### Bigramme

Durch N-Gramme oder diesem Fall Bigramme ist es möglich dem Text deutlich mehr Informationen zu entnehmen, da Wort- und Satzbau mit in Bezug genommen wird. Das Modell wird dadurch natürlich komplexer.

In [67]:
count = CountVectorizer(ngram_range=(2,2))
docs = np.array(testsätze)
bag = count.fit_transform(docs)

In [68]:
print(count.vocabulary_)

{'today it': 7, 'it is': 2, 'is raining': 1, 'raining outside': 4, 'the weather': 5, 'weather today': 9, 'today is': 6, 'is not': 0, 'not very': 3, 'very good': 8}


In [69]:
print(bag.toarray())

[[0 1 1 0 0 0 0 1 0 0]
 [0 1 1 0 1 0 0 0 0 0]
 [1 0 0 1 0 1 1 0 1 1]]


### Beurteilung von Wortrelevanz

#### Tf-idf-Maß

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer

In [23]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=2) # 2 Nachkommastellen
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.58 0.   0.   0.58 0.58 0.   0.  ]
 [0.   0.   0.58 0.   0.   0.   0.58 0.58]
 [0.43 0.33 0.33 0.43 0.33 0.33 0.33 0.33]]


### Tokenizer

Überführung von Wörtern in einzelne Wortabschnitte. Der Tokenizer von Porter überführt Wörter in ihre ursprüngliche Form um Vokabular zu verkleinern. Entweder entfernen von pre-/suffixen oder verwenden des Worstamms so wie es im Lexikon steht (Lemmatisierung).

In [60]:
satz = "I want to go to the mall !"

satz.split(" ")

['I', 'want', 'to', 'go', 'to', 'the', 'mall', '!']

##### Stemming/Lemmatisierung

In [25]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run') ##Manchmal kommen nicht vorhandene Wörter als Ergebnis heraus --> Beispiel thus

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

Leider gibt es bei der nltk library keine deutsche Implementierung, das heißt es muss auf andere Tools/Libraries zurückgegriffen werden. Ein Stemming-Ansatz für deutsches Vokabular wird in folgendem beschrieben: https://textmining.wp.hs-hannover.de/Preprocessing.html

#### Stopwords

In [30]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tigi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [39]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

new_list = []
words = tokenizer_porter('a runner likes running and runs a lot')
for w in words:
    if w not in stop:
        new_list.append(w)
new_list

##oder:
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### Machine Learning Pipeline

manueller Train-/Test Split

In [52]:
from sklearn.model_selection import train_test_split

In [53]:
X_train, X_test, y_train,y_test = train_test_split(tweets.tweets,tweets.label,test_size=0.2, shuffle=True)

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

params= {
    "logisticregression__penalty": ['l1', 'l2'], 
    "logisticregression__C": [1.0,10.0,100.0] 
}

pipeline = make_pipeline(
    TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,ngram_range=(1,1),stop_words=stop,tokenizer=tokenizer_porter),
    LogisticRegression(random_state=0)
)

In [80]:
search = GridSearchCV(pipeline,params,cv=5)

In [81]:
search.fit(X_train,y_train)

Traceback (most recent call last):
  File "C:\Users\tigi\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tigi\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\tigi\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\tigi\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\tigi\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  Fi



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(lowercase=False,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "yo

In [82]:
search.score(X_test,y_test)

0.9935

In [83]:
search.predict(["This Machine Learning course was very good"])

array([1], dtype=int64)

#### Fazit

Das Modell ist ganz gut geeignet für dieses spezielle Datenset, jedoch fehlen dem Modell das Grundverständnis für beispielsweise Verneinungen, Sarkasmus, Satzbau etc. Dies könnte mit erweiterten Methoden angepasst werden. 