In [254]:
import numpy as np
import pandas as pd

# Load the Data

In [255]:
data = pd.read_csv('train.En.csv')
test = pd.read_csv('task_A_En_test.csv')

In [256]:
data = data[['tweet', 'sarcastic']]
data.rename(columns={'tweet': 'text'}, inplace=True)
data['text'] = data['text'].astype('string')
data['sarcastic'] = data['sarcastic'].astype('int')
data.dropna(inplace=True)
data

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


# Tokenize Data

In [257]:
from sklearn.model_selection import train_test_split
from collections import Counter
import re

In [258]:
alpha = re.compile('[^a-z ]')

In [355]:
class Tokenizer:
  def __init__(self, sentences):
    words = [w for s in self.get_words(sentences) for w in s]
    self.wtoi = {w:i for i,(w,c) in enumerate(Counter(words).most_common(2200)[200:])}
    self.n_words = len(self.wtoi)
  def get_words(self, sentences):
    a = [alpha.sub('', s.lower()).split() for s in sentences]
    return [[' '.join(p) for p in zip(ws, ws[1:])] if len(ws) > 2 else ' '.join(ws) for ws in a]
  def tokenize(self, sentences):
    vec = np.zeros((len(sentences), self.n_words))
    for i, s in enumerate(self.get_words(sentences)):
      for w in s:
        if w in self.wtoi:
          vec[i][self.wtoi[w]] += 1
    return vec

tz = Tokenizer(data['text'])
x_train, x_valid, y_train, y_valid = train_test_split(tz.tokenize(data['text']), data['sarcastic'], test_size = 0.1, random_state=42)

# Train Model

In [356]:
from sklearn.tree import DecisionTreeClassifier

In [357]:
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
model.score(x_valid, y_valid)

0.6484149855907781

# Evaluate

In [358]:
from sklearn.metrics import f1_score
# Testing Randomness
f1_score(np.random.randint(0, 2, test['sarcastic'].shape), test['sarcastic'])

0.212624584717608

## Testing our model

In [359]:
f1_score(model.predict(x_train), y_train)

0.9103740296400847

In [360]:
f1_score(model.predict(x_valid), y_valid)

0.3146067415730337

In [361]:
f1_score(model.predict(tz.tokenize(test['text'])), test['sarcastic'])

0.34800838574423476