In [None]:
#Imports
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [None]:
# Despite its name, `train_directory` is the path of the TSV file itself.
train_directory = "datasets/training/offenseval-training-v1.tsv"
print("Reading Dataset...")
# Tab-separated file; header=0 takes the column names from the first row.
train_data = pd.read_csv(train_directory, sep='\t', header=0)

Reading Dataset...


In [None]:
# One-column frames holding the tweet text and the subtask A label of every row.
tweets = train_data[["tweet"]]
subtask_a_labels = train_data[["subtask_a"]]

# Subtask B labels are kept only for rows labelled 'OFF' in subtask A,
# subtask C labels only for rows labelled 'TIN' in subtask B.
subtask_b_labels = train_data.loc[train_data["subtask_a"] == "OFF", ["subtask_b"]]
subtask_c_labels = train_data.loc[train_data["subtask_b"] == "TIN", ["subtask_c"]]

# Independent copy that the preprocessing cells write their results into.
clean_tweets = tweets.copy(deep=True)

In [None]:
##PREPROCESSING##

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# nltk.download takes ONE resource id per call; its second positional argument
# is the download directory. The previous call `nltk.download('punkt', 'stopwords')`
# therefore fetched only punkt, into a folder literally named "stopwords", and
# never requested the stopword list or WordNet data used further down.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # NOTE(review): newer NLTK releases load the tokenizer from this package - confirm for the pinned version
nltk.download('stopwords')  # corpus behind stopwords.words('english')
nltk.download('wordnet')    # corpus behind WordNetLemmatizer.lemmatize

# Shared stemmer / lemmatizer instances used by stem_and_lem().
lancaster_stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to stopwords...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def take_data_to_shower(tweet):
    """Strip placeholder tokens and contraction suffixes, then blank out non-letters.

    The substrings 'URL', '@USER', "'ve", "n't", "'s" and "'m" are deleted
    (in that order), after which every character that is not an ASCII letter
    is replaced by a single space.

    Args:
        tweet: raw tweet text (str).

    Returns:
        The cleaned string; it keeps its original letter casing.
    """
    scrubbed = tweet
    for fragment in ('URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m'):
        scrubbed = scrubbed.replace(fragment, '')

    # One-for-one substitution: digits, punctuation and whitespace each become ' '.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', scrubbed)
    return letters_only


def tokenize(tweet):
    """Lower-case `tweet` and return the result of word_tokenize on it."""
    return word_tokenize(tweet.lower())


def remove_stop_words(tokens):
    """Filter out English stop words and tokens that carry no content.

    A token is kept only if it is not in NLTK's English stop-word list, is not
    made up solely of spaces, and is longer than one character.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the surviving tokens, in their original order.
    """
    english_stops = set(stopwords.words('english'))
    return [
        word
        for word in tokens
        if word not in english_stops
        and word.replace(' ', '') != ''
        and len(word) > 1
    ]


def stem_and_lem(tokens):
    """Lemmatize then stem every token, dropping results of length one or less.

    Each token goes through wordnet_lemmatizer.lemmatize first and
    lancaster_stemmer.stem second.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list of the processed tokens that are longer than one character.
    """
    processed = (
        lancaster_stemmer.stem(wordnet_lemmatizer.lemmatize(word))
        for word in tokens
    )
    return [stem for stem in processed if len(stem) > 1]


In [None]:
# Preprocessing pipeline. Each stage calls tqdm.pandas(...) again so that the
# following progress_apply shows a progress bar with that stage's description.

# Stage 1: character-level cleaning of the raw text, stored in clean_tweets['tweet'].
tqdm.pandas(desc="Cleaning Data Phase I...")
clean_tweets['tweet'] = tweets['tweet'].progress_apply(take_data_to_shower)

# Stage 2: the cleaned text becomes a token list in a new 'tokens' column.
tqdm.pandas(desc="Tokenizing Data...")
clean_tweets['tokens'] = clean_tweets['tweet'].progress_apply(tokenize)

# Stage 3: stop-word filtering; overwrites 'tokens' in place.
tqdm.pandas(desc="Cleaning Data Phase II...")
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(remove_stop_words)

# Stage 4: lemmatize + stem; overwrites 'tokens' again.
tqdm.pandas(desc="Stemming And Lemmatizing")
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(stem_and_lem)

# Plain Python list with one token list per tweet; this is the input to tfid().
text_vector = clean_tweets['tokens'].tolist()


Cleaning Data Phase I...: 100%|██████████| 13240/13240 [00:00<00:00, 90838.52it/s]
Tokenizing Data...: 100%|██████████| 13240/13240 [00:02<00:00, 5726.47it/s]
Cleaning Data Phase II...: 100%|██████████| 13240/13240 [00:02<00:00, 5410.97it/s]
Stemming And Lemmatizing: 100%|██████████| 13240/13240 [00:05<00:00, 2476.73it/s]


In [None]:
##EMBEDDING##

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfid(text_vector):
    """Turn tokenized tweets into a dense TF-IDF matrix.

    Args:
        text_vector: iterable of token lists, one per tweet.

    Returns:
        The dense array produced by calling .toarray() on the TfidfVectorizer
        output, with one row per input tweet.
    """
    # TfidfVectorizer takes strings here, so each token list is re-joined with spaces.
    documents = []
    for token_list in tqdm(text_vector, "Vectorizing..."):
        documents.append(' '.join(token_list))

    # Fit the vocabulary on the very same documents that are then transformed.
    fitted = TfidfVectorizer().fit(documents)
    return fitted.transform(documents).toarray()
  
def get_vectors(vectors, labels, keyword):
    """Select the vectors whose paired label equals `keyword`.

    Args:
        vectors: sequence of feature vectors.
        labels: sequence of labels, pairwise aligned with `vectors`.
        keyword: label value to keep.

    Returns:
        A list of the matching vectors in their original order, or None
        (after printing a message) when the two inputs differ in length.
    """
    if len(vectors) == len(labels):
        return [row for row, tag in zip(vectors, labels) if tag == keyword]

    print("Unmatching sizes!")
    return None

In [None]:
# Subtask A: TF-IDF vectors for every tweet, paired with the subtask A labels.
vectors_a = tfid(text_vector) # Numerical Vectors A
labels_a = subtask_a_labels['subtask_a'].values.tolist() # Subtask A Labels

# Subtask B: only the vectors whose subtask A label is "OFF". labels_b comes
# from the frame filtered on the same condition, so the two line up row by row.
vectors_b = get_vectors(vectors_a, labels_a, "OFF") # Numerical Vectors B
labels_b = subtask_b_labels['subtask_b'].values.tolist() # Subtask B Labels

# Subtask C: only the subtask B vectors labelled "TIN".
# NOTE(review): alignment with labels_c assumes every 'TIN' row is also an
# 'OFF' row in subtask A - confirm against the dataset.
vectors_c = get_vectors(vectors_b, labels_b, "TIN") # Numerical Vectors C
labels_c = subtask_c_labels['subtask_c'].values.tolist() # Subtask C Labels

Vectorizing...: 100%|██████████| 13240/13240 [00:00<00:00, 687301.48it/s]


In [None]:
##CLASSIFYING##

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def _make_estimator(kind, random_state=None):
    """Map a classifier code to an ``(estimator, param_grid)`` pair.

    ``param_grid`` is None when the estimator is fitted directly, without a
    grid search. Returns None when ``kind`` is not a known code.
    """
    if kind == "MNB":
        return MultinomialNB(alpha=0.7), None
    if kind == "KNN":
        return (
            KNeighborsClassifier(n_jobs=4),
            {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
        )
    if kind == "SVM":
        return SVC(), {'C': [0.001, 0.01, 0.1, 1, 10]}
    if kind == "DT":
        return (
            DecisionTreeClassifier(max_depth=800, min_samples_split=5,
                                   random_state=random_state),
            {'criterion': ['gini', 'entropy']},
        )
    if kind == "RF":
        return (
            RandomForestClassifier(max_depth=800, min_samples_split=5,
                                   random_state=random_state),
            {'n_estimators': list(range(50, 200, 50)), 'criterion': ['gini', 'entropy']},
        )
    if kind == "LR":
        # NOTE(review): `multi_class` appears to be deprecated in recent
        # scikit-learn releases - confirm against the installed version.
        return (
            LogisticRegression(multi_class='auto', solver='newton-cg'),
            {"C": np.logspace(-3, 3, 7), "penalty": ["l2"]},
        )
    return None


def classify(vectors, labels, type="DT", test_size=0.25, random_state=None):
    """Train one classifier on a random split and print its evaluation.

    Args:
        vectors: feature vectors, one per sample.
        labels: class labels aligned with ``vectors``.
        type: classifier code, one of "MNB", "KNN", "SVM", "DT", "RF", "LR".
        test_size: fraction of the samples held out for testing. The default
            0.25 gives the original 3 : 1 split.
        random_state: seed passed to the split and to the tree-based
            estimators. Pass a fixed integer for reproducible runs; the
            default None keeps the original unseeded behaviour.

    Returns:
        None. Training accuracy, test accuracy and the test confusion matrix
        are printed. For an unknown ``type`` a message is printed and nothing
        is trained.
    """
    # Validate the code first so a typo does not cost a full split.
    spec = _make_estimator(type, random_state)
    if spec is None:
        print("Wrong Classifier Type!")
        return
    estimator, param_grid = spec

    train_vectors, test_vectors, train_labels, test_labels = train_test_split(
        vectors, labels, test_size=test_size, random_state=random_state
    )

    if param_grid is None:
        classifier = estimator.fit(train_vectors, train_labels)
    else:
        # 3-fold grid search on the training part only; the best estimator is
        # the one used for the evaluation below.
        search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=4)
        search.fit(train_vectors, train_labels)
        classifier = search.best_estimator_

    accuracy = accuracy_score(train_labels, classifier.predict(train_vectors))
    print("Training Accuracy:", accuracy)
    test_predictions = classifier.predict(test_vectors)
    accuracy = accuracy_score(test_labels, test_predictions)
    print("Test Accuracy:", accuracy)
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels, test_predictions))

In [None]:
# Subtask A: trained on all tweets with the subtask A labels.
# `[:]` passes full-length slices; narrow the slice to train on a subset.
print("\nBuilding Model Subtask A...")
classify(vectors_a[:], labels_a[:], "SVM") # {MNB, KNN, SVM, DT, RF, LR}


Building Model Subtask A...




Training Accuracy: 0.66
Test Accuracy: 0.664
Confusion Matrix:
[[166   0]
 [ 84   0]]


In [None]:
# Subtask B: trained on the vectors selected for subtask B with the subtask B labels.
# `[:]` passes full-length slices; narrow the slice to train on a subset.
print("\nBuilding Model Subtask B...")
classify(vectors_b[:], labels_b[:], "SVM") # {MNB, KNN, SVM, DT, RF, LR}


Building Model Subtask B...




Training Accuracy: 0.8773333333333333
Test Accuracy: 0.884
Confusion Matrix:
[[221   0]
 [ 29   0]]


In [None]:
# Subtask C: trained on the vectors selected for subtask C with the subtask C labels.
# `[:]` passes full-length slices; narrow the slice to train on a subset.
print("\nBuilding Model Subtask C...")
classify(vectors_c[:], labels_c[:], "SVM") # {MNB, KNN, SVM, DT, RF, LR}


Building Model Subtask C...




Training Accuracy: 0.628
Test Accuracy: 0.604
Confusion Matrix:
[[  0  70   0]
 [  0 151   0]
 [  0  29   0]]
