In [4]:
import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, RegexpStemmer
from string import punctuation

from collections import Counter

In [10]:
porter = PorterStemmer()
wnl = WordNetLemmatizer() 
stop = stopwords.words('english')

stop.append("!")
stop.append(',')
stop.append('')
stop.append('=')
stop = list(stop)
re = RegexpStemmer('[0-9]+')

# Preprocess text
def preprocess_text(string):
    # Annoying things!
    string = string.replace("=", "")
    string = string.replace("-", "")
    string = string.replace("'", "")
    tokens = word_tokenize(str(string))
    # Punctuations
    tokens = [token for token in tokens if token not in punctuation]
    # Stopwords
    tokens = [token for token in tokens if token not in stop]
    # Numbers
    tokens = [re.stem(token) for token in tokens]
    # convert to lowercase
    tokens = [token.lower() for token in tokens]
    return (tokens)

# Make unigrams and bygrams from a classified data 
def make_grams(classified_data):
    unigram_frequencies = Counter([])
    bigram_frequencies = Counter([])
    
    sentances = classified_data["comment_text"]
    for sentance in sentances:
        unigrams = ngrams(sentance, 1)
        bigrams = ngrams(sentance, 2)
        unigram_frequencies += Counter(unigrams)
        bigram_frequencies += Counter(bigrams)
    return (unigram_frequencies, bigram_frequencies)

In [141]:
train = pd.read_csv('test.csv', index_col=0)
train_labels = pd.read_csv('test_labels.csv', index_col=0)

# Process data!
train["comment_text"] = train.comment_text.apply(preprocess_text)
data = pd.concat([train, train_labels], axis=1)

# Take classified data!
"""
toxic_data = data[(data.toxic == 1)]
severe_toxic_data = data[(data.severe_toxic == 1)]
obscene_data = data[(data.obscene == 1)]
threat_data = data[(data.threat == 1)]
insult_data = data[(data.insult == 1)]
insult_data = data[(data.identity_hate == 1)]

# Make some grams to inspect the data
insult_unigram, insult_bigram = make_grams(insult_data)
threat_unigram, threat_bigram = make_grams(threat_data)
obscene_unigram, obscene_bigram = make_grams(obscene_data)
toxic_unigram, toxic_bigram = make_grams(toxic_data)
severe_toxic_unigram, severe_toxic_bigram = make_grams(severe_toxic_data)
"""

'\ntoxic_data = data[(data.toxic == 1)]\nsevere_toxic_data = data[(data.severe_toxic == 1)]\nobscene_data = data[(data.obscene == 1)]\nthreat_data = data[(data.threat == 1)]\ninsult_data = data[(data.insult == 1)]\ninsult_data = data[(data.identity_hate == 1)]\n\n# Make some grams to inspect the data\ninsult_unigram, insult_bigram = make_grams(insult_data)\nthreat_unigram, threat_bigram = make_grams(threat_data)\nobscene_unigram, obscene_bigram = make_grams(obscene_data)\ntoxic_unigram, toxic_bigram = make_grams(toxic_data)\nsevere_toxic_unigram, severe_toxic_bigram = make_grams(severe_toxic_data)\n'

In [157]:
def filter_comments(data, toxic = 0, severe_toxic = 0, obscene = 0, threat = 0, insult = 0, identity_hate = 0):
    condition = (data.toxic == toxic) & (data.severe_toxic == severe_toxic) & (data.obscene == obscene) & (data.threat == threat) & (data.insult == insult) & (data.identity_hate == identity_hate)
    return data[condition]
    
    

toxic_only = filter_comments(data, toxic = 1)
severe_toxic_only = filter_comments(data, severe_toxic = 1)
obscene_only = filter_comments(data, obscene = 1)
threat_only = filter_comments(data, threat = 1)
insult_only = filter_comments(data, insult = 1)
identity_hate_only = filter_comments(data, identity_hate = 1)
cleant_documents = filter_comments(data)

In [184]:
total_documents = len(data)

clean_document_count = len(cleant_documents)
toxic_document_count = len(toxic_only)
severe_toxic_document_count = len(severe_toxic_only)
obscene_document_count = len(obscene_only)
threat_document_count = len(threat_only)
insult_document_count = len(insult_only)
identity_hate_document_count = len(identity_hate_only)

print("Precentage of documents that belong to only ONE class")
print("============")
print("Total documents: ", total_documents)
print("Clean documents: ", round(clean_document_count / total_documents, 3),  " , count: ", clean_document_count)
print("============")
print("Only toxic documents: ", round(toxic_document_count / total_documents, 3),  " , count: ", toxic_document_count)
print("Only severe toxic documents: ", round(severe_toxic_document_count / total_documents, 3),  " , count: ", severe_toxic_document_count)
print("Only obscene documents: ", round(obscene_document_count / total_documents, 5),  " , count: ", obscene_document_count)
print("Only threath documents: ", round(threat_document_count / total_documents, 6),  " , count: ", threat_document_count)
print("Only insult documents: ", round(insult_document_count / total_documents, 5),  " , count: ", insult_document_count)
print("Only identity hate documents: ", round(identity_hate_document_count / total_documents, 5),  " , count: ", identity_hate_document_count)

Precentage of documents that belong to only ONE class
Total documents:  153164
Clean documents:  0.377  , count:  57735
Only toxic documents:  0.011  , count:  1710
Only severe toxic documents:  0.0  , count:  0
Only obscene documents:  0.00032  , count:  49
Only threath documents:  3.3e-05  , count:  5
Only insult documents:  0.00042  , count:  64
Only identity hate documents:  9e-05  , count:  14


In [188]:
insult_unigram, insult_bigram = make_grams(toxic_only)
insult_bigram.most_common(20)

[(('boob', 'boob'), 999),
 (('poop', 'poop'), 954),
 (('faggot', 'faggot'), 647),
 (('i', 'hate'), 496),
 (("''", "''"), 464),
 (('hate', 'you'), 464),
 (('die', 'die'), 455),
 (('you', 'i'), 453),
 (('``', "''"), 402),
 (('hate', 'hate'), 367),
 (('analanal', 'anal'), 350),
 (('anal', 'analanal'), 350),
 (('bums', 'bums'), 349),
 (('kill', 'yourself'), 247),
 (('yourself', 'kill'), 246),
 (('balls', 'balls'), 217),
 (('balls', 'ballsballs'), 215),
 (('ballsballs', 'balls'), 215),
 (('...', '...'), 213),
 (('anime', 'rules'), 185)]

In [94]:
import numpy as np
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [98]:
train = pd.read_csv('train.csv', index_col=0)

X = train.iloc[:, 0]
Y = train.iloc[:, 1:]

corpus = Corpus(X)

In [185]:
# Make a BOW with unique integers!
"""
word_to_ix = {}
for sent in X:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)


VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 31

# Assign a integer to a word
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

# Calculate the target
def make_target(label):
    tmp = 0
    for test in Y.iloc[49]:
        tmp += test
    return torch.LongTensor([0])

class BoWClassifier(nn.Module):  # inheriting from nn.Module!
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=1)

model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for index in range(len(X)):
    model.zero_grad()
    bow_vec = make_bow_vector(X.iloc[index], word_to_ix)
    target = make_target(Y.iloc[index])
    
    log_probs = model(bow_vec)
"""
# TO FIX!

RuntimeError: addmm(): argument 'mat1' (position 1) must be Variable, not torch.FloatTensor