In [1]:
import pandas as pd
import nltk
import re
import pickle
from nltk.corpus import words
import os
import numpy as np

In [2]:
data_path = "train_tweets.csv"

In [3]:
df = pd.read_csv(data_path, encoding="ISO-8859-1")

In [4]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df)

del df

In [5]:
df_test[df_test.label == 1]

Unnamed: 0,id,label,tweet
31929,31930,1,did keep #colinpowell and #condoleezzarice fr...
4611,4612,1,new york bistro faces shocking accusations: ho...
6723,6724,1,"@user #in2017iwantto work at seeing, naming an..."
8426,8427,1,we cannot address issues of fair trade or fres...
5151,5152,1,"keep in mind @user is totally not , but all wh..."
2657,2658,1,"it does not help when a person comments ""i don..."
21615,21616,1,@user @user if you are going to call me a the...
14821,14822,1,"@user #mediamisogyny, russian interference, c..."
29398,29399,1,yeah man fuck feminism
23827,23828,1,@user so sorry


In [6]:
df_train[df_train.label == 1]

Unnamed: 0,id,label,tweet
8290,8291,1,@user @user @user haha i get it
29899,29900,1,meanwhile german #police can also not be disti...
18936,18937,1,@user is whiter than kevin federline
3687,3688,1,"police, check cashing industry, atlanta and pr..."
15572,15573,1,"#jeffsessions is , #bigoted and #notfit for th..."
24164,24165,1,@user @user @user @user arabs' : islamofas...
5436,5437,1,"@user that whole #russia #trump #tiesto , #whi..."
26450,26451,1,@user #allahsoil enlightenment is wasted on th...
8247,8248,1,as a student in 89/90 in a #gay bar i stood ne...
27625,27626,1,@user @user you want to call me a so i am hel...


In [7]:
tweets_train = df_train["tweet"].values
tweets_test = df_test["tweet"].values

In [9]:
PAD = "#PAD#"
UNK = "#UNK#"
lemmetizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(tweets):
    # to lower case
    text = "\n".join(list(tweets)).lower()
    # remove number and punctuations
    text = re.sub(r'[^a-z \n]+', '', text).strip()
    # remove multiple spaces with single space
    text = re.sub(' +', ' ', text).strip()
    return text

def create_data(text):
    data = []
    for sentence in text.split("\n"):
        temp = []
        for word in sentence.split(" "):
            word = word.strip()
            if word == "":
                continue
            if word not in stop_words:
                temp.append(word)
        data.append(lemmetizer.lemmatize(" ".join(temp)).split(" "))
    return data

def process_and_pad_data(data, max_len, vocab):
    X = np.zeros(shape=(len(data), max_len))
    for ind_r, row in enumerate(data):
        X_row = np.zeros(shape=(max_len, ))
        for ind_c, word in enumerate(row[:max_len]):
            X_row[ind_c] = vocab.index(word) if word in vocab else vocab.index(UNK)
        X[ind_r] = X_row
    return X

In [10]:
text_train = preprocess_text(tweets_train)
text_test = preprocess_text(tweets_test)

In [11]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
stop_words = list(ENGLISH_STOP_WORDS)

In [12]:
word_stats = {}
for sentence in text_train.split("\n"):
    for word in sentence.split(" "):
        word = word.strip()
        if word == "":
            continue
        if word not in word_stats.keys():
            word_stats[word] = 0
        word_stats[word] += 1
reverse_word_stats = {}
for k, v in word_stats.items():
    reverse_word_stats[v] = k
print("Top 50 most occurence words in training set: ")
for occ in sorted(reverse_word_stats.keys(), reverse=True)[:50]:
    print(reverse_word_stats[occ], occ, "in stop words:", reverse_word_stats[occ] in stop_words)

Top 50 most occurence words in training set: 
user 13162 in stop words: False
the 7636 in stop words: True
to 7347 in stop words: True
a 4886 in stop words: True
i 4306 in stop words: True
you 4072 in stop words: True
and 3647 in stop words: True
in 3512 in stop words: True
for 3381 in stop words: True
of 3136 in stop words: True
is 3124 in stop words: True
my 2762 in stop words: True
love 2027 in stop words: False
on 1992 in stop words: True
this 1985 in stop words: True
it 1905 in stop words: True
with 1875 in stop words: True
be 1848 in stop words: True
day 1691 in stop words: False
so 1451 in stop words: True
all 1442 in stop words: True
that 1390 in stop words: True
are 1366 in stop words: True
me 1361 in stop words: True
happy 1244 in stop words: False
at 1237 in stop words: True
your 1233 in stop words: True
have 1205 in stop words: True
amp 1202 in stop words: False
just 1026 in stop words: False
its 980 in stop words: True
will 978 in stop words: True
we 947 in stop words: Tru

In [13]:
stop_words.append("user")
stop_words.append("u")

In [14]:
del word_stats, reverse_word_stats

In [15]:
data_train = create_data(text_train)
data_test = create_data(text_test)

In [16]:
vocab = []
if "vocab.pkl" not in os.listdir():    
    words_ = lemmetizer.lemmatize(" ".join(words.words()).lower()).split(" ")
    words_ = sorted(list(set(words_)))
    vocab = [PAD, UNK]
    for sentence in data_train:
        for word in sentence:
            if word in words_ and word not in vocab:
                vocab.append(word)
    with open("vocab.pkl", "wb") as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
else:
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)

In [17]:
max_len = 20

In [18]:
data_train = process_and_pad_data(data_train, max_len, vocab)
data_test = process_and_pad_data(data_test, max_len, vocab)

In [19]:
data_train

array([[1.000e+00, 1.000e+00, 1.099e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 1.000e+00, 1.210e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.800e+01, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [2.026e+03, 2.027e+03, 1.544e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [6.034e+03, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.791e+03, 3.383e+03, 4.256e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [20]:
labels_train = df_train["label"].values
labels_test = df_test["label"].values

In [21]:
import tensorflow as tf
keras = tf.keras
L = keras.layers
M = keras.models

In [22]:
EMBED_SIZE = 100
LSTM_UNITS = 256
DENSE_UNITS = 128

In [23]:
tf.reset_default_graph()

In [24]:
input_X = L.Input(shape=(max_len,))

embed_words = L.Embedding(len(vocab), EMBED_SIZE)
lstm_cell = L.LSTMCell(LSTM_UNITS)
lstm_layer = L.RNN(lstm_cell, return_state=True)

embed = embed_words(input_X)
output, state_h, state_c = lstm_layer(embed)

output = L.Dense(units=DENSE_UNITS, activation="elu")(output)
output = L.Dense(units=64, activation="elu")(output)
logits = L.Dense(units=1, activation="sigmoid")(output)

model = M.Model(inputs=[input_X], outputs=[logits])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding (Embedding)        (None, 20, 100)           954300    
_________________________________________________________________
rnn (RNN)                    [(None, 256), (None, 256) 365568    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,361,085
Trainable params: 1,361,085
Non-trainable para

In [None]:
optimizer = keras.optimizers.Adam(lr=0.01)
loss = keras.losses.binary_crossentropy

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

In [31]:
batch_size = 128
epochs = 20

In [32]:
if "model" not in os.listdir():
    os.mkdir("model")

checkpoints = keras.callbacks.ModelCheckpoint("model/weights.hdf5")

In [33]:
model.fit(data_train, labels_train, batch_size=batch_size, epochs=epochs, 
          validation_data=(data_test, labels_test), callbacks=[checkpoints])

Train on 23971 samples, validate on 7991 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x236be288a20>

In [8]:
model = M.load_model("model/weights.hdf5")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [49]:
test_path = "test_tweets.csv"
test_df = pd.read_csv(test_path, encoding="ISO-8859-1")

In [50]:
test_tweets = test_df["tweet"].values

In [51]:
text_test = preprocess_text(test_tweets)

In [52]:
data_test = create_data(text_test)
data_test = process_and_pad_data(data_test, max_len, vocab)

In [53]:
data_test

array([[1.000e+00, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [5.060e+02, 1.000e+00, 1.960e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [7.220e+02, 1.084e+03, 1.279e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [4.190e+02, 1.000e+00, 1.440e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.140e+02, 2.920e+02, 5.430e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.130e+02, 9.220e+02, 2.700e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [54]:
pred = model.predict(data_test)

In [55]:
pred[pred >= 0.35] = 1
pred[pred < 0.35] = 0

In [56]:
test_df["label"] = pred.astype("int32")

In [57]:
test_df

Unnamed: 0,id,tweet,label
0,31963,#studiolife #aislife #requires #passion #dedic...,0
1,31964,@user #white #supremacists want everyone to s...,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,0
3,31966,is the hp and the cursed child book up for res...,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",0
5,31968,choose to be :) #momtips,0
6,31969,something inside me dies Ã°ÂÂÂ¦Ã°ÂÂÂ¿Ã¢ÂÂ...,0
7,31970,#finished#tattoo#inked#ink#loveitÃ¢ÂÂ¤Ã¯Â¸Â ...,0
8,31971,@user @user @user i will never understand why...,0
9,31972,#delicious #food #lovelife #capetown mannaep...,0


In [58]:
test_df = test_df.drop(columns=["tweet"])

In [59]:
test_df.to_csv("submission.csv", index=False)