In [26]:
import re
import json

import pandas as pd
import spacy
from collections import Counter, OrderedDict
import re
import string
import warnings; warnings.simplefilter('ignore')

# NLTK imports
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [27]:
df = pd.read_csv('data/train_data.csv')

In [28]:
df.label.value_counts()

irrelevant    31045
debate         8909
agree          3678
clickbait       840
Name: label, dtype: int64

In [29]:
df_upd = df[df.label != 'irrelevant']

In [30]:
df_upd.head()

Unnamed: 0,title,text,label
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree
8,Report: Taliban Detainee Swapped for Bowe Berg...,A Guantanamo Bay prisoner released last year a...,debate
9,Did Paul Rudd Help Take Down Dallas Airport Ho...,So… Rebecca Schoenkopf over at Wonkette is pre...,debate
10,US officials: Video shows American's execution,WARNING: GRAPHIC IMAGES. A masked militant cla...,debate
11,Was The Video Of That Homeless Man Doing Good ...,Awwwww!\nThis is such a heartwarming story!\nW...,clickbait


In [31]:
docs = df_upd['text']

In [32]:
train_df, validation_df = train_test_split(df_upd, test_size=0.2, random_state=42)

In [24]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X = list(map(self._clean_sentence, X))
        return X
    
    def _clean_sentence(self, sentence):
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)
            
        # Remove punctuation
        if self.remove_punct:
            words = list(filter(lambda x: x not in string.punctuation, words))

        # Stem words
        if self.stemmer:
            words = map(self.stemmer.stem, words)

        # Join list elements into string
        sentence = " ".join(words)
        
        return sentence
    
    def fit(self, *_):
        return self


In [33]:
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)
regex_list = [("<[^>]*>", "")
             ]

cleaner = TextCleanerTransformer(tokenizer, stemmer, regex_list)
docs = cleaner.transform(train_df.text.values)

In [34]:
vectorizer = CountVectorizer(stop_words='english')

In [35]:
vectorizer.fit(docs)

# Looking at a small sample of the vocabulary:
vocabulary = list(vectorizer.vocabulary_.keys())
print("Small sample of the vocabulary:", vocabulary[0:20])

# Number of words in the vocabulary
print("\nNumber of distinct words:", len(vocabulary))

Small sample of the vocabulary: ['day', 'michael', 'brown', 'buri', 'ferguson', 'man', 'releas', 'claim', 'new', 'evid', 'case', 'cnn', 'report', 'audio', 'unnam', 'say', 'having', 'video', 'chat', 'woman']

Number of distinct words: 15428


In [36]:
sentence = docs[12:13]
print(sentence[0], '\n')

# Tranform sentence into bag of words representation
word_count_sentence = vectorizer.transform(sentence)

# Find the indexes of the words which appear in the sentence
_, columns = word_count_sentence.nonzero()

# Get the inverse map to map vector indexes to words
vocabulary = vectorizer.vocabulary_
inv_map = {v: k for k, v in vocabulary.items()}

# Extract the corresponding word and count
counts = [(inv_map[i], word_count_sentence[0, i]) for i in columns]

for word, count in counts:
    print(word, ": ", count)

the young soldier kill in a terror attack on the canadian parliament last night was a devot famili man and a career soldier who plan on becom a border guard by contrast his killer was a convict crimin who was on the terror watchlist and had his passport confisc it was chanc that brought the two men togeth when michael zehaf bibeau 32 launch his attack as corpor nathan cirillo 25 stood guard at ottawa ’ s war memori just metr away from the nation parliament zehaf bibeau shot corpor cirillo dead before run into parliament only to be kill himself by the sergeant at arm kevin vicker vicker sergeant at arm the hero of ottawa mr vicker has been hail as a hero after he dash into his offic to retriev a hand gun before shoot zehaf bibeau dead michael zehaf bibeau the gunman respons for the shoot imag credit cbcottawa michael zehaf bibeau shot dead nathan cirillo and storm parliament before being kill pictur from the twitter account of cbc ottawa canadian prime minist stephen harper declar the a

In [40]:
word_count_matrix = vectorizer.transform(df_upd['text'].values)
word_count_matrix.shape

(13427, 15428)

In [38]:
tfidf = TfidfTransformer()
tfidf.fit(word_count_matrix)

word_term_frequency_matrix = tfidf.transform(word_count_matrix)

In [39]:
vectorizer_123_grams = CountVectorizer(stop_words= 'english', ngram_range=(1,3))
vectorizer_123_grams.fit(docs)
word_count_matrix = vectorizer_123_grams.transform(df['text'].values)
word_count_matrix.shape

(44472, 450478)