# Tokenization

In [17]:
for i in "Ez egy szöveg.":
    print(i)

E
z
 
e
g
y
 
s
z
ö
v
e
g
.


In [9]:
[1, 2, 3, 4][0]+1

2

In [1]:
text = "This is a simple sentence."
text.split()

['This', 'is', 'a', 'simple', 'sentence.']

In [2]:
text.replace(".", " ").split()

['This', 'is', 'a', 'simple', 'sentence']

In [3]:
text = "This is a simple sentence, isn't it?"

In [4]:
from re import split
split(" |,|\?", text)

['This', 'is', 'a', 'simple', 'sentence', '', "isn't", 'it', '']

In [18]:
from nltk import word_tokenize

tokenized = word_tokenize(text)

In [20]:
tokenized.count("This")

1

In [22]:
word_tokenize("This is this is that".lower()).count("this")

2

# Stemming and Lemmatization

In [23]:
import nltk
# nltk.download("omw-1.4")
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

### Stemming

In [27]:
stemmer = PorterStemmer()

In [25]:
words = ["word", "words", "walk", "walks", "walking", "happy", "happier"]
[stemmer.stem(w) for w in words]

['word', 'word', 'walk', 'walk', 'walk', 'happi', 'happier']

In [9]:
stemmer.stem("was")

'wa'

In [10]:
stemmer.stem("better")

'better'

### Lemmatization

In [28]:
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(w) for w in words]

['word', 'word', 'walk', 'walk', 'walking', 'happy', 'happier']

In [12]:
lemmatizer.lemmatize("happier", pos=wordnet.ADJ)

'happy'

In [30]:
lemmatizer.lemmatize("was", pos=wordnet.VERB)

'be'

In [31]:
print(wordnet.VERB)
lemmatizer.lemmatize("was", pos="v")

v


'be'

### Spaghetti of lexical categories

In [15]:
from pandas import DataFrame

DataFrame([["JJ", "ADJ", "a"],
           ["VB", "VERB", "v"],
           ["RB", "ADVERB", "r"],
           ["NN, PRP, etc.", "NOUN", "n"]],
          columns=["nltk.pos_tag", "wordnet.postag", "string parameter"])

Unnamed: 0,nltk.pos_tag,wordnet.postag,string parameter
0,JJ,ADJ,a
1,VB,VERB,v
2,RB,ADVERB,r
3,"NN, PRP, etc.",NOUN,n


In [32]:
class Lemmatizer(WordNetLemmatizer):
    
    def __init__(self):
        self.POS_DICT = {"J": "a",
                    "V": "v",
                    "N": "n",
                    "R": "r"
                   }
        
    def __call__(self, text):
        words = nltk.word_tokenize(text)
        keys = [tag[1][0] for tag in nltk.pos_tag(words)]
        pos_tags = [self.POS_DICT[k] if k in self.POS_DICT.keys() else "n" for k in keys]
        lemmas = [self.lemmatize(word, tag) for (word, tag) in zip(words, pos_tags)]
        return lemmas

In [34]:
texts = ['A ball fell from a tree?', "Knocking on heaven's door?",
         'Open the door please', 'Eye for an eye', 'Donald Trump is closing the gates']

text = " ".join(texts)
text

"A ball fell from a tree? Knocking on heaven's door? Open the door please Eye for an eye Donald Trump is closing the gates"

In [35]:
lemmatizer = Lemmatizer()
lemmatizer(text)

['A',
 'ball',
 'fell',
 'from',
 'a',
 'tree',
 '?',
 'Knocking',
 'on',
 'heaven',
 "'s",
 'door',
 '?',
 'Open',
 'the',
 'door',
 'please',
 'Eye',
 'for',
 'an',
 'eye',
 'Donald',
 'Trump',
 'be',
 'close',
 'the',
 'gate']

# Stopwords

In [38]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\witen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [46]:
stopwords.words("hungarian")[:10]

['a',
 'ahogy',
 'ahol',
 'aki',
 'akik',
 'akkor',
 'alatt',
 'által',
 'általában',
 'amely']

# Text Vectoriztaion

In [51]:
import numpy as np
import pandas as pd

In [52]:
texts_split = [t.split() for t in texts]
texts_split

[['A', 'ball', 'fell', 'from', 'a', 'tree?'],
 ['Knocking', 'on', "heaven's", 'door?'],
 ['Open', 'the', 'door', 'please'],
 ['Eye', 'for', 'an', 'eye'],
 ['Donald', 'Trump', 'is', 'closing', 'the', 'gates']]

In [60]:
texts_split[1]

['Knocking', 'on', "heaven's", 'door?']

In [54]:
lemmatized = [lemmatizer(t) for t in texts]
lemmatized

[['A', 'ball', 'fell', 'from', 'a', 'tree', '?'],
 ['Knocking', 'on', 'heaven', "'s", 'door', '?'],
 ['Open', 'the', 'door', 'please'],
 ['Eye', 'for', 'an', 'eye'],
 ['Donald', 'Trump', 'be', 'close', 'the', 'gate']]

In [55]:
tokens = word_tokenize(" ".join(texts).lower())
uniqs = list(set(tokens))
uniqs

['?',
 "'s",
 'open',
 'the',
 'gates',
 'door',
 'from',
 'eye',
 'ball',
 'fell',
 'for',
 'please',
 'trump',
 'heaven',
 'an',
 'closing',
 'tree',
 'a',
 'donald',
 'is',
 'knocking',
 'on']

In [56]:
mat = np.zeros(shape=(len(texts), len(uniqs)))
mat.shape

(5, 22)

In [57]:
for i in range(mat.shape[0]):
    for j in range(mat.shape[1]):
        count = [l.lower() for l in lemmatized[i]].count(uniqs[j])
        mat[i,j] = count

pd.DataFrame(mat, columns=uniqs)

Unnamed: 0,?,'s,open,the,gates,door,from,eye,ball,fell,...,trump,heaven,an,closing,tree,a,donald,is,knocking,on
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Sklearn's CountVectorizer

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

In [62]:
texts

['A ball fell from a tree?',
 "Knocking on heaven's door?",
 'Open the door please',
 'Eye for an eye',
 'Donald Trump is closing the gates']

In [63]:
vectorizer = CountVectorizer(stop_words="english")
mat = vectorizer.fit_transform(texts)

pd.DataFrame(mat.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,ball,closing,donald,door,eye,fell,gates,heaven,knocking,open,tree,trump
0,1,0,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,2,0,0,0,0,0,0,0
4,0,1,1,0,0,0,1,0,0,0,0,1


In [64]:
percent = (mat != 0).sum() / (mat.shape[0] * mat.shape[1])
f"{round(percent, 3)*100}% is not 0"

'21.7% is not 0'

# Predicting outcome (SVM)

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [91]:
df = pd.read_csv("BBC News Train.csv")
X = df["Text"]
Y = df["Category"]

In [92]:
df.iloc[:,2].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, X_test.shape)

(1192,) (298,)


In [94]:
vectorizer = CountVectorizer(stop_words="english")

In [97]:
X_train_s = vectorizer.fit_transform(X_train)
X_test_s = vectorizer.transform(X_test)

In [98]:
svm = SVC(kernel="linear").fit(X_train_s, Y_train)

In [100]:
svm.score(X_train_s, Y_train)

1.0

In [102]:
svm.score(X_test_s, Y_test)

0.9798657718120806

In [104]:
y_pred = svm.predict(X_test_s)

pd.DataFrame(confusion_matrix(Y_test, y_pred, normalize="pred"),
             columns=df["Category"].unique(), index=df["Category"].unique())


Unnamed: 0,business,tech,politics,sport,entertainment
business,0.972222,0.0,0.018519,0.016949,0.0
tech,0.0,0.983607,0.018519,0.0,0.0
politics,0.027778,0.0,0.962963,0.0,0.0
sport,0.0,0.0,0.0,0.983051,0.0
entertainment,0.0,0.016393,0.0,0.0,1.0


## With lemmatization

In [127]:
from time import time

In [126]:
start = time()
vectorizer = CountVectorizer(stop_words="english", tokenizer=Lemmatizer())
X_train_l = vectorizer.fit_transform(X_train)
X_test_l = vectorizer.transform(X_test)
lemma_time = time() - start

28.326080322265625

In [129]:
(lemma_time * (15000/len(df)))/60

4.751677852348993

In [121]:
svm2 = SVC(kernel="linear").fit(X_train_l, Y_train)

In [122]:
svm2.score(X_train_l, Y_train)

1.0

In [139]:
svm2.score(X_test_l, Y_test)

0.9932885906040269

In [140]:
y_pred = svm2.predict(X_test_l)
pd.DataFrame(confusion_matrix(Y_test, y_pred, normalize="pred"),
                              columns=df["Category"].unique(), index=df["Category"].unique())

Unnamed: 0,business,tech,politics,sport,entertainment
business,0.986301,0.0,0.0,0.0,0.0
tech,0.0,1.0,0.0,0.0,0.0
politics,0.013699,0.0,1.0,0.016949,0.0
sport,0.0,0.0,0.0,0.983051,0.0
entertainment,0.0,0.0,0.0,0.0,1.0


## With stemming

In [112]:
class Stemmer(nltk.stem.porter.PorterStemmer):
    def __call__(self, text):
        tokens = word_tokenize(text)
        stemmed = [self.stem(t) for t in tokens]
        return stemmed

In [130]:
start = time()
vectorizer = CountVectorizer(stop_words="english", tokenizer=Stemmer())
X_train_st = vectorizer.fit_transform(X_train)
X_test_st = vectorizer.transform(X_test)
stopw_time = time() - start



In [132]:
(stopw_time * (15000/len(df)))/60

2.0235818104455934

In [133]:
df["Text"][0]



In [114]:
svm3 = SVC(kernel="linear").fit(X_train_st, Y_train)
svm3.score(X_train_st, Y_train)

1.0

In [115]:
svm3.score(X_test_st, Y_test)

0.9865771812080537

In [None]:
y_pred_s = svm3.predict(X_test_s)
confusion_matrix(Y_test, y_pred_s, normalize="pred")

# Predicting unlabeled data

In [141]:
test = pd.read_csv("BBC News Test.csv")

In [142]:
test

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...
...,...,...
730,1923,eu to probe alitalia state aid the european ...
731,373,u2 to play at grammy awards show irish rock ba...
732,1704,sport betting rules in spotlight a group of mp...
733,206,alfa romeos to get gm engines fiat is to sto...


In [144]:
test["Prediction"] = svm3.predict(vectorizer.transform(test["Text"]))

In [145]:
test

Unnamed: 0,ArticleId,Text,Prediction
0,1018,qpr keeper day heads for preston queens park r...,sport
1,1319,software watching while you work software that...,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,sport
3,459,india s reliance family feud heats up the ongo...,business
4,1020,boro suffer morrison injury blow middlesbrough...,sport
...,...,...,...
730,1923,eu to probe alitalia state aid the european ...,business
731,373,u2 to play at grammy awards show irish rock ba...,entertainment
732,1704,sport betting rules in spotlight a group of mp...,business
733,206,alfa romeos to get gm engines fiat is to sto...,business
