In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# Allow up to 200 characters per column when DataFrames are displayed, so
# long tweet text is not cut off at pandas' default column width.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load the train and test CSV files into DataFrames (train first, test second).
train, test = map(pd.read_csv, ("train_2kmZucJ.csv", "test_oJQbWVk.csv"))

In [3]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

((7920, 3), (1953, 2))

In [4]:
# Class balance: fraction of training rows for each label value.
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [5]:
# Peek at the first rows of the raw training data.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [6]:
# Strip URLs from train and test: every "http" followed by a run of
# non-whitespace characters is deleted, and the result is stored in a new
# 'clean_tweet' column next to the untouched 'tweet' column.
url_pattern = re.compile(r'http\S+')
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(lambda text: url_pattern.sub('', text))

In [7]:
# Characters stripped from the tweets. Apostrophes, commas and periods are
# not part of this string, so they survive the cleaning step.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# Build the lookup set once instead of once per character of every tweet.
punctuation_chars = set(punctuation)

# Apply the same cleaning steps, in the same order, to train and test.
for frame in (train, test):
    cleaned = frame['clean_tweet']

    # remove punctuation marks
    cleaned = cleaned.apply(lambda text: ''.join(ch for ch in text if ch not in punctuation_chars))

    # convert text to lowercase
    cleaned = cleaned.str.lower()

    # replace every digit with a space; regex=True is stated explicitly
    # because newer pandas versions treat the pattern as a literal string
    # by default, which would leave the digits in place
    cleaned = cleaned.str.replace("[0-9]", " ", regex=True)

    # collapse runs of whitespace into single spaces and trim both ends
    cleaned = cleaned.apply(lambda text: ' '.join(text.split()))

    frame['clean_tweet'] = cleaned

In [8]:
# import spaCy's language model with the parser and NER components disabled
# (the rest of this notebook only reads token lemmas from the pipeline).
# NOTE(review): the 'en' shortcut name presumably only resolves on spaCy 2.x
# installs with a linked English model — confirm before running elsewhere.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [9]:
# function to lemmatize text
def lemmatization(texts):
    """Return the lemmatized form of each text.

    Args:
        texts: iterable of strings (the notebook passes a pandas Series
            of cleaned tweets).

    Returns:
        list of str, one entry per input text in the same order; each
        entry is the lemmas of that text's tokens joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp() for every single tweet.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

# Replace the cleaned tweets of both frames with their lemmatized form.
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])

In [10]:
# Spot-check raw tweets against their cleaned form; the fixed seed makes the
# same ten rows appear every time the notebook is re-run.
train.sample(10, random_state=42)

Unnamed: 0,id,label,tweet,clean_tweet
5218,5219,1,These two iPhone apps made me realize that my love life Sucks http://tinyurl.com/yhxyp3t #iphone,these two iphone app make -PRON- realize that -PRON- love life suck iphone
5088,5089,0,Don't disturb... #busy #playing #tablet #samsung #black #white #kiddos #welovethem #ig #igers… http://instagram.com/p/c-g5zOgU19/,do not disturb ... busy play tablet samsung black white kiddo welovethem ig iger …
5908,5909,0,#youtube#subscribe #daily #vlogs #twitch #gaming #fun #ps4 #xbox #sony #games #destiny2 #dance #destiny #hypehttps://youtu.be/AC4URjpulYA,youtubesubscribe daily vlog twitch gaming fun ps xbox sony game destiny dance destiny hype
6403,6404,0,I wish i had these right now #starwars #battlefront #ps4 #sony #dontjudgeme http://tmblr.co/Z0GZ8y1yGR9V3,i wish i have these right now starwar battlefront ps sony dontjudgeme
4083,4084,0,Coffee and cooking with mum! and #thankful making #glutenfree #apple #chutneypic.twitter.com/Y4DKMZphBj,coffee and cooking with mum and thankful make glutenfree apple chutneypic.twitter.comy dkmzphbj
2028,2029,1,@EconBizFin time to rebel against their constant product launches and upgrades #apple,econbizfin time to rebel against -PRON- constant product launch and upgrade apple
1665,1666,0,Get A #FREE #iPhone 8 From #Sprint and AT&T #apple #iphone8 #bestdeal #gooddeal #bogo #att #great #tech #buy https://techinwire.com/2017/09/19/get-a-free-iphone-8-from-sprint-and-att/ …pic.twitter...,get a free iphone from sprint and att apple iphone bestdeal gooddeal bogo att great tech buy … pic.twitter.comyigifr tbg
4807,4808,1,Hayden's Iphone could not be updated after two fucking hours! #APPLE,hayden 's iphone could not be update after two fucking hour apple
1297,1298,0,"We would like to wish you an amazing day! Make every minute count #tls #today #iphone #accessories #news #life October 09, 2017 at 07…","-PRON- would like to wish -PRON- an amazing day make every minute count tls today iphone accessory news life october , at …"
7168,7169,1,"The new #Facebook design is giving me migraine. Learn from #Apple, pls. Make your changes meaningful and useful to your users.","the new facebook design be give -PRON- migraine . learn from apple , pls . make -PRON- change meaningful and useful to -PRON- user ."


In [11]:
import tensorflow_hub as hub
import tensorflow as tf

  from ._conv import register_converters as _register_converters
W0604 21:16:43.310985 14428 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [14]:
# Load the ELMo module (version 2) from TensorFlow Hub.
# NOTE(review): trainable=True is set, but the visible cells only run the
# module to extract features and never train it — confirm it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

W0604 21:24:47.860374 14428 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [15]:
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]
# Extract ELMo features: the "elmo" output holds one 1024-dimensional vector
# per token, so the shape is (sentences, tokens, 1024) — (1, 8, 1024) here.
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [18]:
def elmo_vectors(x):
  """Return one averaged ELMo vector per text in ``x``.

  Args:
    x: object exposing ``tolist()`` that yields the texts (the notebook
      passes a pandas Series of cleaned tweets).

  Returns:
    The evaluated mean of the module's "elmo" output over axis 1,
    computed in a fresh TensorFlow session.
  """
  token_vectors = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
  # average of ELMo features over axis 1 (presumably the token axis)
  sentence_vectors = tf.reduce_mean(token_vectors, 1)
  # NOTE(review): every call builds new ops and opens a new session; this
  # looks costly when called once per batch — confirm it is acceptable.
  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    return session.run(sentence_vectors)

# Number of tweets sent through ELMo per call; one constant replaces the
# literal 100 that was repeated in every slice and range step.
BATCH_SIZE = 100

# Split each frame into consecutive row batches (the last one may be shorter).
# .iloc makes it explicit that the slices are positional.
list_train = [train.iloc[start:start + BATCH_SIZE] for start in range(0, train.shape[0], BATCH_SIZE)]
list_test = [test.iloc[start:start + BATCH_SIZE] for start in range(0, test.shape[0], BATCH_SIZE)]

# Extract ELMo embeddings, one array per batch
elmo_train = [elmo_vectors(batch['clean_tweet']) for batch in list_train]
elmo_test = [elmo_vectors(batch['clean_tweet']) for batch in list_test]

In [19]:
# Stack the per-batch arrays along axis 0 into a single array per split.
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

# save elmo_train_new; the with-block closes the file even if dump() raises
with open("elmo_train_03032019.pickle","wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle","wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [20]:
# NOTE: pickle.load can execute arbitrary code from the file; only load
# these files if they were produced by this notebook.

# load elmo_train_new; the with-block closes the file handle after reading
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation; the fixed seed keeps
# the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42
)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Name the solver explicitly instead of relying on the library default,
# which differs between scikit-learn releases; 'liblinear' is the solver
# shown in the recorded model repr, so the recorded score stays comparable.
lreg = LogisticRegression(solver='liblinear')
lreg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Predict labels for the held-out validation features.
preds_valid = lreg.predict(xvalid)

In [24]:
# F1 score of the validation predictions against the true validation labels.
f1_score(yvalid, preds_valid)

0.7752675386444708

In [25]:
# make predictions on the test set's ELMo features with the fitted model
preds_test = lreg.predict(elmo_test_new)

In [26]:
# Build the submission frame: each test id paired with its predicted label.
sub = pd.DataFrame(dict(id=test['id'], label=preds_test))

# Save the predictions as CSV, leaving out the DataFrame index.
sub.to_csv("sub_lreg.csv", index=False)