# Tokenization

In [1]:
text = "This is a simple sentence."
text.split()

['This', 'is', 'a', 'simple', 'sentence.']

In [2]:
text.replace(".", " ").split()

['This', 'is', 'a', 'simple', 'sentence']

In [3]:
text = "This is a simple sentence, isn't it?"

In [4]:
from re import split
split(" |,|\?", text)

['This', 'is', 'a', 'simple', 'sentence', '', "isn't", 'it', '']

In [5]:
from nltk import word_tokenize

word_tokenize(text)

['This', 'is', 'a', 'simple', 'sentence', ',', 'is', "n't", 'it', '?']

# Stemming and Lemmatization

In [6]:
import nltk
# nltk.download("omw-1.4")
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

### Stemming

In [7]:
stemmer = PorterStemmer()

In [8]:
words = ["word", "words", "walk", "walks", "walking", "happy", "happier"]
[stemmer.stem(w) for w in words]

['word', 'word', 'walk', 'walk', 'walk', 'happi', 'happier']

In [9]:
stemmer.stem("was")

'wa'

In [10]:
stemmer.stem("better")

'better'

### Lemmatization

In [11]:
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(w) for w in words]

['word', 'word', 'walk', 'walk', 'walking', 'happy', 'happier']

In [12]:
lemmatizer.lemmatize("happier", pos=wordnet.ADJ)

'happy'

In [13]:
lemmatizer.lemmatize("was", pos=wordnet.VERB)

'be'

In [14]:
print(wordnet.VERB)
lemmatizer.lemmatize("was", pos="v")

v


'be'

### Spaghetti of lexical categories

In [15]:
from pandas import DataFrame

DataFrame([["JJ", "ADJ", "a"],
           ["VB", "VERB", "v"],
           ["RB", "ADVERB", "r"],
           ["NN, PRP, etc.", "NOUN", "n"]],
          columns=["nltk.pos_tag", "wordnet.postag", "string parameter"])

Unnamed: 0,nltk.pos_tag,wordnet.postag,string parameter
0,JJ,ADJ,a
1,VB,VERB,v
2,RB,ADVERB,r
3,"NN, PRP, etc.",NOUN,n


In [16]:
class Lemmatizer(WordNetLemmatizer):
    
    def __init__(self):
        self.POS_DICT = {"J": "a",
                    "V": "v",
                    "N": "n",
                    "R": "r"
                   }
        
    def lemmatokenize(self, text):
        words = nltk.word_tokenize(text)
        keys = [tag[1][0] for tag in nltk.pos_tag(words)]
        pos_tags = [self.POS_DICT[k] if k in self.POS_DICT.keys() else "n" for k in keys]
        lemmas = [self.lemmatize(word, tag) for (word, tag) in zip(words, pos_tags)]
        return lemmas

In [17]:
texts = ['A ball fell from a tree?', "Knocking on heaven's door?",
         'Open the door please', 'Eye for an eye', 'Donald Trump is closing the gates']

text = " ".join(texts)
text

"A ball fell from a tree? Knocking on heaven's door? Open the door please Eye for an eye Donald Trump is closing the gates"

In [18]:
lemmatizer = Lemmatizer()
lemmatizer.lemmatokenize(text)

['A',
 'ball',
 'fell',
 'from',
 'a',
 'tree',
 '?',
 'Knocking',
 'on',
 'heaven',
 "'s",
 'door',
 '?',
 'Open',
 'the',
 'door',
 'please',
 'Eye',
 'for',
 'an',
 'eye',
 'Donald',
 'Trump',
 'be',
 'close',
 'the',
 'gate']

# Stopwords

In [19]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\witen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
stopwords.words("hungarian")[:10]

['a',
 'ahogy',
 'ahol',
 'aki',
 'akik',
 'akkor',
 'alatt',
 'által',
 'általában',
 'amely']

# Text Vectoriztaion

In [22]:
import numpy as np
import pandas as pd

In [23]:
texts_split = [t.split() for t in texts]
texts_split

[['A', 'ball', 'fell', 'from', 'a', 'tree?'],
 ['Knocking', 'on', "heaven's", 'door?'],
 ['Open', 'the', 'door', 'please'],
 ['Eye', 'for', 'an', 'eye'],
 ['Donald', 'Trump', 'is', 'closing', 'the', 'gates']]

In [24]:
lemmatized = [lemmatizer.lemmatokenize(t) for t in texts]
lemmatized

[['A', 'ball', 'fell', 'from', 'a', 'tree', '?'],
 ['Knocking', 'on', 'heaven', "'s", 'door', '?'],
 ['Open', 'the', 'door', 'please'],
 ['Eye', 'for', 'an', 'eye'],
 ['Donald', 'Trump', 'be', 'close', 'the', 'gate']]

In [25]:
tokens = word_tokenize(" ".join(texts).lower())
uniqs = list(set(tokens))
uniqs

['a',
 'the',
 '?',
 'gates',
 'trump',
 'closing',
 'open',
 'please',
 'door',
 'on',
 "'s",
 'is',
 'for',
 'fell',
 'heaven',
 'from',
 'donald',
 'an',
 'eye',
 'ball',
 'tree',
 'knocking']

In [26]:
mat = np.zeros(shape=(len(texts), len(uniqs)))
mat.shape

(5, 22)

In [27]:
for i in range(mat.shape[0]):
    for j in range(mat.shape[1]):
        count = [l.lower() for l in lemmatized[i]].count(uniqs[j])
        mat[i,j] = count

pd.DataFrame(mat, columns=uniqs)

Unnamed: 0,a,the,?,gates,trump,closing,open,please,door,on,...,for,fell,heaven,from,donald,an,eye,ball,tree,knocking
0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Sklearn's CountVectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

In [29]:
texts

['A ball fell from a tree?',
 "Knocking on heaven's door?",
 'Open the door please',
 'Eye for an eye',
 'Donald Trump is closing the gates']

In [30]:
vectorizer = CountVectorizer(stop_words="english")
mat = vectorizer.fit_transform(texts)

pd.DataFrame(mat.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,ball,closing,donald,door,eye,fell,gates,heaven,knocking,open,tree,trump
0,1,0,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,2,0,0,0,0,0,0,0
4,0,1,1,0,0,0,1,0,0,0,0,1


In [31]:
percent = (mat != 0).sum() / (mat.shape[0] * mat.shape[1])
f"{round(percent, 3)*100}% is not 0"

'21.7% is not 0'

# Predicting outcome (SVM)

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [87]:
df = pd.read_csv("BBC News Train.csv")
X = df["Text"]
Y = df["Category"]

In [88]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, x_test.shape)

(1192,) (298,)


In [89]:
vectorizer = CountVectorizer(stop_words="english")

In [90]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [91]:
svm = SVC().fit(X_train, Y_train)

In [92]:
svm.score(X_train, Y_train)

0.99748322147651

In [93]:
svm.score(X_test, Y_test)

0.9395973154362416

In [95]:
nb = MultinomialNB().fit(X_train, Y_train)

In [97]:
nb.score(X_train, Y_train)

0.9958053691275168

In [98]:
nb.score(X_test, Y_test)

0.9731543624161074

In [100]:
test = pd.read_csv("BBC News Test.csv")

In [None]:
x_test = test["Text"]
y_test = test["Ca"]