# Navier Bayes (mit Worthäufigkeiten)

In [4]:
import pandas as pd

# Daten von: https://www.kaggle.com/uciml/sms-spam-collection-dataset
# Aufbereitet wie folgt:
#  - 2 "Unnamed" Spalten entfernt
#  - Kodierung auf utf-8 geändert
#  - Spalten unbenannt

df = pd.read_csv("data/spam.csv")

df.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
print(len(df))

5572


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X = df["message"]
y = df["type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [7]:
#Anzeigen aller Texte 
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will ?_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object


In [8]:
#Anzeigen  der Klassenzuordnung
print(X_train)

872                       I'll text you when I drop x off
831     Hi mate its RV did u hav a nice hol just a mes...
1273    network operator. The service is free. For T &...
3314    FREE MESSAGE Activate your 500 FREE Text Messa...
4929    Hi, the SEXYCHAT girls are waiting for you to ...
                              ...                        
4931                Match started.india  &lt;#&gt;  for 2
3264    44 7732584351, Do you want a New Nokia 3510i c...
1653    I was at bugis juz now wat... But now i'm walk...
2607    :-) yeah! Lol. Luckily i didn't have a starrin...
2732    How dare you stupid. I wont tell anything to y...
Name: message, Length: 4179, dtype: object


In [9]:
#Jetzt wird eine Invertierung durchgeführt. Alle Worte werden 
#werden bestimmt. Matrix: Welches Wort kommt in welchem Text vor. 
#cv = CountVectorizer()
#Der Count Vectorizer kann limitieren: 
cv = CountVectorizer(min_df = 0.001, max_df = 0.25)
cv = CountVectorizer(max_features = 1000)
cv = CountVectorizer()
cv.fit(X_train)
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [10]:
#Wieviele Zeilen und Spalten hat die Matrix
print(X_test.shape)

(1393, 7323)


In [11]:
print(X_train)

  (0, 2343)	1
  (0, 3921)	1
  (0, 4611)	1
  (0, 6408)	1
  (0, 7069)	1
  (0, 7289)	1
  (1, 805)	1
  (1, 1951)	1
  (1, 2191)	1
  (1, 2340)	1
  (1, 3170)	1
  (1, 3172)	1
  (1, 3210)	1
  (1, 3235)	1
  (1, 3273)	1
  (1, 3436)	1
  (1, 3555)	1
  (1, 3655)	1
  (1, 4128)	1
  (1, 4198)	1
  (1, 4493)	1
  (1, 4611)	1
  (1, 5468)	1
  (1, 5528)	1
  (1, 5529)	1
  :	:
  (4176, 6592)	1
  (4176, 6954)	1
  (4176, 6980)	1
  (4176, 6987)	1
  (4176, 7073)	1
  (4177, 2193)	1
  (4177, 3171)	1
  (4177, 3872)	1
  (4177, 3943)	1
  (4177, 4008)	1
  (4177, 5477)	1
  (4177, 6080)	1
  (4177, 7262)	1
  (4177, 7289)	1
  (4178, 791)	1
  (4178, 932)	1
  (4178, 2062)	1
  (4178, 3191)	1
  (4178, 3327)	1
  (4178, 6178)	1
  (4178, 6320)	1
  (4178, 6378)	1
  (4178, 6546)	2
  (4178, 7168)	2
  (4178, 7289)	3


In [12]:
cv.get_feature_names()

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [13]:
cv.vocabulary_

{'ll': 3921,
 'text': 6408,
 'you': 7289,
 'when': 7069,
 'drop': 2343,
 'off': 4611,
 'hi': 3235,
 'mate': 4128,
 'its': 3555,
 'rv': 5528,
 'did': 2191,
 'hav': 3170,
 'nice': 4493,
 'hol': 3273,
 'just': 3655,
 'message': 4198,
 'say': 5589,
 'hello': 3210,
 'coz': 1951,
 'haven': 3172,
 'sent': 5681,
 'in': 3436,
 'ages': 805,
 'started': 6084,
 'driving': 2340,
 'so': 5931,
 'stay': 6095,
 'roads': 5468,
 'rvx': 5529,
 'network': 4475,
 'operator': 4673,
 'the': 6431,
 'service': 5693,
 'is': 3539,
 'free': 2814,
 'for': 2775,
 'visit': 6898,
 '80488': 595,
 'biz': 1293,
 'activate': 745,
 'your': 7294,
 '500': 492,
 'messages': 4200,
 'by': 1501,
 'replying': 5386,
 'to': 6546,
 'this': 6467,
 'with': 7133,
 'word': 7176,
 'terms': 6394,
 'conditions': 1863,
 'www': 7227,
 '07781482378': 29,
 'com': 1809,
 'sexychat': 5708,
 'girls': 2965,
 'are': 978,
 'waiting': 6946,
 'them': 6438,
 'now': 4564,
 'great': 3056,
 'night': 4500,
 'chatting': 1653,
 'send': 5672,
 'stop': 6129,
 

In [14]:
#Auf die Term-Dokument-Matrix wird der Multinomilale Naive Bayes angewendet
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.9863603732950467


In [15]:
model.predict("Hello World")

ValueError: Expected 2D array, got scalar array instead:
array=Hello World.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [16]:
X_train[5]['message']

IndexError: Index dimension must be 1 or 2