# Imports

In [1]:
import numpy as np
import pandas as pd

# Load the Data

In [2]:
data = pd.read_csv('train.En.csv')
test = pd.read_csv('task_A_En_test.csv')

In [3]:
# We only need the text and whether it's sarcastic
data = data[['tweet', 'sarcastic']]
# Rename tweet to text to stay consistent with the test set
data.rename(columns={'tweet': 'text'}, inplace=True)
# Ensure datatypes are what we expect
data['text'] = data['text'].astype('string')
data['sarcastic'] = data['sarcastic'].astype('int')
data.dropna(inplace=True)
# Display the test data
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


# Tokenize Data

In [4]:
from sklearn.model_selection import train_test_split
from collections import Counter
import re

In [5]:
# This regex selects anything that is not a lowercase alphabet or a space
alpha = re.compile('[^a-z ]')

In [6]:
class Tokenizer:
  def __init__(self, sentences):
    # gets every pair of words in the entire text
    words = [w for s in self.get_words(sentences) for w in s]
    # maps the 2200 most common bigram to a number
    # throws out the 200 most common though
    self.wtoi = {w:i for i,(w,c) in enumerate(Counter(words).most_common(2200)[200:])}
    # counts the number of words in the mapping
    self.n_words = len(self.wtoi)
  def get_words(self, sentences):
    # makes every sentence lowercase, removes all non-letters, and then splits based on spaces
    a = [alpha.sub('', s.lower()).split() for s in sentences]
    # gets every pair of words (bigrams) unless the tweet is one word in which case it's a unigram
    return [[' '.join(p) for p in zip(ws, ws[1:])] if len(ws) > 2 else ' '.join(ws) for ws in a]
  def tokenize(self, sentences):
    # Makes a vector object to hold the tokens
    vec = np.zeros((len(sentences), self.n_words))
    # This code count how many occurances of each bigram occur in the sentence
    # and increments the corresponding index in the vector based on self.wtoi
    for i, s in enumerate(self.get_words(sentences)):
      for w in s:
        if w in self.wtoi:
          vec[i][self.wtoi[w]] += 1
    # Returns the vector
    return vec

# Initialize the tokenizer
tz = Tokenizer(data['text'])
# Get our training and test data
x_train, x_valid, y_train, y_valid = train_test_split(tz.tokenize(data['text']), data['sarcastic'], test_size = 0.1, random_state=42)

# Train Model

In [7]:
# We are using a decision tree classifier for our model
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Train it with a random state for consistency
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
model.score(x_valid, y_valid)

0.6484149855907781

# Evaluate

In [9]:
# Import the f1 score metric
from sklearn.metrics import f1_score
# Testing Randomness
f1_score(np.random.randint(0, 2, test['sarcastic'].shape), test['sarcastic'])

0.21238938053097342

## Testing our model

In [10]:
# Score the training data
f1_score(model.predict(x_train), y_train)

0.9103740296400847

In [11]:
# Score the validation set
f1_score(model.predict(x_valid), y_valid)

0.3146067415730337

In [12]:
# Score the test set
f1_score(model.predict(tz.tokenize(test['text'])), test['sarcastic'])

0.34800838574423476

# Save what we got wrong

In [13]:
# Create a mask of every incorrect prediction
off = model.predict(tz.tokenize(test['text'])) != test['sarcastic']

In [14]:
# Save the incorrect predictions to a file
test[off].to_csv('wrong.csv')