# Data Importing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
train_df = pd.read_csv('/content/drive/MyDrive/NLPDT/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/NLPDT/test.csv')

# Package Importing

In [4]:
import os
import pandas as pd
import numpy as np
import re
import time
import emoji
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Preprocessing

### Preprocessing with Baseline Method

In [7]:
train_df = train_df.drop_duplicates('text')
len(train_df)

7503

In [8]:
def removeEmoji(text):
    return emoji.replace_emoji(text, '')

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def tweet_cleaner(text):
    text = text.lower() #convert to lowercase
    text = text.translate(str.maketrans("","", string.punctuation)) #remove punctuation
    text = removeEmoji(text) #remove emoji
    tk = WhitespaceTokenizer()  #tokenize text to list of words without space
    textsplit = tk.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag(textsplit) #get tags (noun, verb...)

    result = []
    for i,w in enumerate(tag):
        word = w[0]
        tag = w[1]
        if word not in stopwords.words("english") and not word.startswith("http") and "\\" not in word and '#' not in word and '@' not in word:
            result.append(lemmatizer.lemmatize(word,get_wordnet_pos(tag))) #lemmartize word to current tense

    text = ' '.join(result) #join back the result
    return text

### Preprocessing with Map

In [9]:
import time

t = time.process_time()

result = []
for index, tweet in enumerate(train_df['text']):
    tweet = tweet_cleaner(tweet)
    result.append(tweet)

elapsed_time = time.process_time() - t
print("time: %0.10f" % elapsed_time)

time: 15.4237315220


In [10]:
import time

t = time.process_time()

result = list(map(lambda x: tweet_cleaner(x), train_df['text']))

elapsed_time = time.process_time() - t
print("time: %0.10f" % elapsed_time)

time: 16.1167844240


In [11]:
%time result = list(map(lambda x: tweet_cleaner(x), train_df['text']))

CPU times: user 14.1 s, sys: 1.19 s, total: 15.3 s
Wall time: 15.8 s


### Preprocessing with Decreased Functions Call

In [None]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def tweet_cleaner(text):
    text = text.lower() #convert to lowercase
    text = text.translate(str.maketrans("","", string.punctuation)) #remove punctuation
    text = emoji.replace_emoji(text, '') #remove emoji
    tk = WhitespaceTokenizer()  #tokenize text to list of words without space
    textsplit = tk.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag(textsplit) #get tags (noun, verb...)

    result = []
    for i,w in enumerate(tag):
        word = w[0]
        tag = w[1]
        if word not in stopwords.words("english") and not word.startswith("http") and "\\" not in word and '#' not in word and '@' not in word:
            result.append(lemmatizer.lemmatize(word,get_wordnet_pos(tag))) #lemmartize word to current tense

    text = ' '.join(result) #join back the result
    return text

In [None]:
%time result = list(map(lambda x: tweet_cleaner(x), train_df['text']))

CPU times: user 20.9 s, sys: 1.74 s, total: 22.6 s
Wall time: 22.8 s


In [None]:
def tweet_cleaner(text):
    text = text.lower() #convert to lowercase
    text = text.translate(str.maketrans("","", string.punctuation)) #remove punctuation
    text = emoji.replace_emoji(text, '') #remove emoji
    tk = WhitespaceTokenizer()  #tokenize text to list of words without space
    textsplit = tk.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag(textsplit) #get tags (noun, verb...)

    result = []
    for i,w in enumerate(tag):
        word = w[0]
        tag = w[1]
        if word not in stopwords.words("english") and not word.startswith("http") and "\\" not in word and '#' not in word and '@' not in word:
            if tag.startswith('J'):
                tag = wordnet.ADJ
            elif tag.startswith('V'):
                tag = wordnet.VERB
            elif tag.startswith('N'):
                tag = wordnet.NOUN
            elif tag.startswith('R'):
                tag = wordnet.ADV
            else:
                tag = wordnet.NOUN
            result.append(lemmatizer.lemmatize(word, pos = tag)) #lemmartize word to current tense

    text = ' '.join(result) #join back the result
    return text

In [None]:
%time result = list(map(lambda x: tweet_cleaner(x), train_df['text']))

CPU times: user 20.4 s, sys: 1.65 s, total: 22 s
Wall time: 24.4 s


### Preprocessing with Multiprocessing

In [None]:
from multiprocessing.pool import Pool

def do_work(text):

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    text = text.lower() #convert to lowercase
    text = text.translate(str.maketrans("","", string.punctuation)) #remove punctuation
    text = emoji.replace_emoji(text, '') #remove emoji
    tk = WhitespaceTokenizer()  #tokenize text to list of words without space
    textsplit = tk.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag(textsplit) #get tags (noun, verb...)

    res = []
    for i,w in enumerate(tag):
        word = w[0]
        tag = w[1]
        if word not in stopwords.words("english") and not word.startswith("http") and "\\" not in word and '#' not in word and '@' not in word:
            res.append(lemmatizer.lemmatize(word,get_wordnet_pos(tag))) #lemmartize word to current tense

    text = ' '.join(res) #join back the result

    return text

def preprocess_text(data, num_processes):

    with Pool(num_processes) as p:
        results = p.map(do_work, data)

    return results

In [None]:
%time result = preprocess_text(train_df['text'], 4)

CPU times: user 151 ms, sys: 88 ms, total: 239 ms
Wall time: 20.7 s


In [None]:
train_df['text'] = result

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df['text'].tolist(), train_df['target'].tolist(), test_size=0.2, random_state=42, shuffle=False)

# SVM Model

### Baseline Train Method

In [None]:
import math
from tqdm import tqdm

def dotProduct(d1, d2):
    result = 0
    for f, v in d1.items():
        for g, w in d2.items():
            if f == g:
                result += v * w
    return result

def increment(d1, scale, d2):
    d1_copy = d1.copy()
    for f, v in d2.items():
        if f in d1_copy:
            d1_copy[f] = d1_copy[f] + v * scale
        else:
            d1_copy[f] = v * scale
    return d1_copy

def multiplication(scale, d1):
    d1_copy = d1.copy()
    for f, v in d1_copy.items():
        d1_copy[f] = v * scale
    return d1_copy

def pegasos_advanced(X, y, lbd, max_epochs=15):
    s = 1
    W = {}
    t = 2
    for i in tqdm(range(max_epochs)):
        for j in range(len(y)):
            xj = X[j]
            yj = y[j]
            eta = 1 / (lbd * t)
            margin = s * yj * dotProduct(W, xj)
            if margin < 1:
                s = (1 - eta * lbd) * s
                W = increment(W, eta * yj / s, xj)
            else:
                s = (1 - eta * lbd) * s
            t += 1
        norm_w = s ** 2 * dotProduct(W, W)
        if norm_w <= 1 / math.sqrt(lbd):
            break
    W = multiplication(s, W)
    return W


In [None]:
%%time
W = pegasos_advanced(X_train, y_train, 0.01)

100%|██████████| 15/15 [12:16<00:00, 49.07s/it]

CPU times: user 12min 12s, sys: 1.35 s, total: 12min 14s
Wall time: 12min 16s





### Train Method with Vectorization

In [None]:
def dotProduct(d1, d2):
    if len(d1) < len(d2):
        return dotProduct(d2, d1)
    else:
        return sum(d1.get(f, 0) * v for f, v in d2.items())

def increment(d1, scale, d2):
    for f, v in d2.items():
        d1[f] = d1.get(f, 0) + v * scale

def multiplication(scale, d1):
    for f, v in d1.items():
        d1[f] = d1.get(f, 0) * scale

def pegasos_advanced(X, y, lbd, max_epochs=15):
    y = np.array(y)  # Convert y to a NumPy array
    s = 1
    W = {}
    t = 2
    for i in tqdm(range(max_epochs)):
        for j in range(len(y)):
            xj = X[j]
            yj = y[j]
            eta = 1 / (lbd * t)
            margin = s * yj * dotProduct(W, xj)
            if margin < 1:
                s = (1 - eta * lbd) * s
                increment(W, eta * yj / s, xj)
            else:
                s = (1 - eta * lbd) * s
            t += 1
        norm_w = s ** 2 * dotProduct(W, W)
        if norm_w <= 1 / math.sqrt(lbd):
            break
    multiplication(s, W)
    return W

In [None]:
%%time
W = pegasos_advanced(X_train, y_train, 0.01)

100%|██████████| 15/15 [00:01<00:00, 13.08it/s]

CPU times: user 1.15 s, sys: 6 ms, total: 1.15 s
Wall time: 1.16 s





### Train Method with Reduced Loop and Vectorizaiton

In [None]:
import math
import numpy as np
from tqdm import tqdm
import itertools

def dotProduct(d1, d2):
    return sum(d1.get(f, 0) * v for f, v in d2.items())

def increment(d1, scale, d2):
    for f, v in d2.items():
        d1[f] = d1.get(f, 0) + v * scale

def multiplication(scale, d1):
    for f in d1:
        d1[f] *= scale

def pegasos_advanced(X, y, lbd, max_epochs=15):
    y = np.array(y)  # Convert y to a NumPy array
    s = 1
    W = {}
    t = 2
    data = zip(itertools.cycle(X), itertools.cycle(y))
    for _ in tqdm(range(max_epochs * len(y))):
        xj, yj = next(data)
        eta = 1 / (lbd * t)
        margin = s * yj * dotProduct(W, xj)
        if margin < 1:
            s = (1 - eta * lbd) * s
            increment(W, eta * yj / s, xj)
        else:
            s = (1 - eta * lbd) * s
        t += 1
        if t > max_epochs * len(y):
            break
    multiplication(s, W)
    return W

In [None]:
%%time
W = pegasos_advanced(X_train, y_train, 0.01)

100%|█████████▉| 90028/90030 [00:01<00:00, 82221.77it/s]

CPU times: user 1.09 s, sys: 8.01 ms, total: 1.1 s
Wall time: 1.11 s





# Predicting

In [None]:
def sign(x):
    return 1 if x >= 0 else -1


def test(W, X_test, y_test):
    return sum(1 for x, y_true in zip(X_test, y_test) if y_true == sign(dotProduct(W, x))) / len(y_test)

In [None]:
test(W, X_test, y_test)

0.760159893404397