In [2]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# show up to 200 characters per column so long tweets are not cut off when displayed
pd.options.display.max_colwidth = 200

In [3]:
# read the training and the test CSV; only '\n' is treated as the end of a record
train, test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ("train.csv", "20190527_test.csv")
)

In [4]:
# (rows, columns) of the train frame, then of the test frame
tuple(frame.shape for frame in (train, test))

((6331, 3), (2712, 2))

In [5]:
def make_label(df):
    """Add a binary ``label`` column derived from the ``label1`` column.

    Rows whose ``label1`` equals ``'Positive'`` get 1; every other value
    (including a missing one) gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label1`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be chained.
    """
    # vectorised comparison instead of a per-row Python lambda;
    # int64 is spelled out so the dtype does not depend on the platform
    df["label"] = df["label1"].eq('Positive').astype('int64')
    return df
make_label(train)
# relative frequency of each class in the freshly created label column
train.loc[:, 'label'].value_counts(normalize=True)

1    0.530564
0    0.469436
Name: label, dtype: float64

In [6]:
# A notebook cell only renders its last bare expression, so two consecutive
# .head() calls would show the test frame alone. display() (provided by
# IPython in notebook sessions) renders both previews.
display(train.head(), test.head())

Unnamed: 0,id,tweet
0,1,masha allah ache cheez hai
1,2,Wazir e Mumlikat Saira Afzal K Walid Ko Shikast PTI K Mamoon Jaffar Kamyab
2,3,SelfieKing Ban Gia Dulha
3,4,Buhat he ache quality ke product hay.... i love daraz.pk
4,5,Hahahah :p naam letaa tu ziada ddoubt hootaa magaar yh bolkr direct attack krdia isnay :p


In [7]:
# remove URLs from train and test: drop "http" plus everything up to the next whitespace
url_pattern = re.compile(r'http\S+')
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(lambda text: url_pattern.sub('', text))

In [8]:
# Characters stripped from the tweets. '.', ',' and the apostrophe are not in
# this list, so they survive the cleaning step.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# deletion table built once up front rather than once per character
_strip_punctuation = str.maketrans('', '', punctuation)


def _normalize_tweets(tweets):
    """Return a cleaned copy of a Series of tweet strings.

    Steps, in order: delete the characters listed in ``punctuation``,
    lower-case the text, replace every digit with a space, and collapse
    runs of whitespace into single spaces (also trimming both ends).

    Parameters
    ----------
    tweets : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        New Series with the same index holding the cleaned strings.
    """
    return (
        tweets
        # remove punctuation marks
        .apply(lambda text: text.translate(_strip_punctuation))
        # convert text to lowercase
        .str.lower()
        # remove numbers; regex=True is passed explicitly because newer pandas
        # versions treat the pattern as a literal string by default
        .str.replace("[0-9]", " ", regex=True)
        # collapse repeated whitespace and trim the ends
        .apply(lambda text: ' '.join(text.split()))
    )


train['clean_tweet'] = _normalize_tweets(train['clean_tweet'])
test['clean_tweet'] = _normalize_tweets(test['clean_tweet'])

In [9]:
# load spaCy's language model without the pipeline components listed below
# NOTE(review): the 'en' shortcut name looks like a spaCy 2.x convention - confirm before upgrading spaCy
unused_pipes = ['parser', 'ner']
nlp = spacy.load('en', disable=unused_pipes)

In [10]:
# function to lemmatize text
def lemmatization(texts):
    """Lemmatize every text with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : iterable of str
        Texts to process, e.g. a pandas Series of tweets.

    Returns
    -------
    list of str
        One string per input text, in input order, made of the tokens'
        lemmas joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the whole pipeline once per text
    return [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(texts)
    ]

# overwrite the cleaned tweets of both frames with their lemmatized form
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])

In [11]:
# fixed seed so the same ten rows are shown on every re-run
train.sample(10, random_state=42)

Unnamed: 0,id,tweet,label1,label,clean_tweet
3195,3194,khair mubarak.aur ap sb ko b EidMubarak.,Positive,1,khair mubarak.aur ap sb ko b eidmubarak .
1185,1186,boht acha moble hy,Positive,1,boht acha moble hy
118,119,Maa shaa ALLAH tasbih khane ky wazif lajawb ha.,Positive,1,maa shaa allah tasbih khane ky wazif lajawb ha .
2605,2604,Phr b begerat aur dheet awam inhi ko vote degi...,Negative,0,phr b begerat aur dheet awam inhi ko vote degi ...
4446,4445,"1998 mein Karan Johar ki film “kuch kuch hota hai” saal ki behtareen film qarar pai, is film mein Salman Khan ne bator mehman adakar ke tor par kaam kiya lekin unhon ne ye mukhtasir kirdar is khub...",Positive,1,"mein karan johar ki film “ kuch kuch hota hai "" saal ki behtareen film qarar pai , be film mein salman khan ne bator mehman adakar ke tor par kaam kiya lekin unhon ne ye mukhtasir kirdar be khubi ..."
1049,1050,Main intehayi pareshan hal insan hun help me sir i need you,Negative,0,main intehayi pareshan hal insan hun help -PRON- sir i need -PRON-
166,167,Aap ki yehi salahiyat 1937 mein aap ko Islamia College ke magazine �EEE€�EEEcrescent�EEE€�EEE ka naib mudeer banwane meinmadadgar sabit hoi,Positive,1,aap ki yehi salahiyat mein aap ko islamia college ke magazine � eee€ � eeecrescent � eee€ � eee ka naib mudeer banwane meinmadadgar sabit hoi
5707,5705,Murshad da dedar hy lakh karoran hajjan,Positive,1,murshad da dedar hy lakh karoran hajjan
3418,3417,UNHY jinsi toor br harasa krny ki kosis nhi blky aye roz unhy harasa kea jata hy.,Negative,0,unhy jinsi toor br harasa krny ki kosis nhi blky aye roz unhy harasa kea jata hy .
6202,6200,Agar banned krna he tu sab ko banned kru dramo ko banned kru filmo ko banned ko not only sahil okk sab behayi he pir,Negative,0,agar ban krna -PRON- tu sab ko ban kru dramo ko ban kru filmo ko ban ko not only sahil okk sab behayi -PRON- pir


In [12]:
import tensorflow_hub as hub
import tensorflow as tf

  from ._conv import register_converters as _register_converters
W0605 15:11:00.147042  1372 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [13]:
# TF-Hub handle from which the ELMo module is loaded
ELMO_MODULE_URL = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(ELMO_MODULE_URL, trainable=True)

W0605 15:11:00.669757  1372 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]
# Extract ELMo features: ask for the whole output dict, then pick its "elmo" entry
elmo_outputs = elmo(x, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [None]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per text in ``x``.

    Parameters
    ----------
    x : pandas.Series of str
        Batch of texts; converted with ``.tolist()`` before being passed to
        the module-level ``elmo`` hub module.

    Returns
    -------
    numpy.ndarray
        One row per text: the mean of that text's token embeddings.

    Notes
    -----
    Every call adds new ops to the default TensorFlow graph and opens a
    fresh session in which the variables and tables are initialised again.
    """
    outputs = elmo(x.tolist(), signature="default", as_dict=True)
    token_vectors = outputs["elmo"]  # padded to the longest text in the batch
    # Mask out the padding positions so each text is averaged over its own
    # tokens only. A plain mean over axis 1 divides by the padded length, which
    # makes a text's vector depend on the other texts in the same batch.
    token_mask = tf.sequence_mask(
        outputs["sequence_len"],
        maxlen=tf.shape(token_vectors)[1],
        dtype=tf.float32,
    )
    # guard against division by zero for texts without any token
    token_counts = tf.maximum(tf.reduce_sum(token_mask, axis=1, keepdims=True), 1.0)
    summed_vectors = tf.reduce_sum(token_vectors * tf.expand_dims(token_mask, -1), axis=1)
    mean_vectors = summed_vectors / token_counts
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # return the masked average of the ELMo features
        return sess.run(mean_vectors)

# split each frame into consecutive chunks of at most BATCH_ROWS rows
BATCH_ROWS = 100
list_train = [train[start:start + BATCH_ROWS] for start in range(0, len(train), BATCH_ROWS)]
list_test = [test[start:start + BATCH_ROWS] for start in range(0, len(test), BATCH_ROWS)]
# Extract ELMo embeddings, one array per chunk
elmo_train = [elmo_vectors(chunk['clean_tweet']) for chunk in list_train]
elmo_test = [elmo_vectors(chunk['clean_tweet']) for chunk in list_test]

In [None]:
# stack the per-batch arrays into a single array per data set
elmo_train_new = np.concatenate(elmo_train, axis=0)
elmo_test_new = np.concatenate(elmo_test, axis=0)

# save elmo_train_new; the with-block closes the file even if dumping fails
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
# load elmo_train_new; the with-block makes sure the file handle is closed
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)
# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the training features and labels for validation (seeded split)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# logistic regression with the library's default settings, fitted on the training split
lreg = LogisticRegression()
lreg.fit(X=xtrain, y=ytrain)

In [None]:
# predicted labels for the held-out validation split
preds_valid = lreg.predict(X=xvalid)

In [None]:
# F1 score of the validation predictions against the true validation labels
f1_score(y_true=yvalid, y_pred=preds_valid)

In [None]:
# predict labels for the ELMo features of the test set
preds_test = lreg.predict(X=elmo_test_new)

In [None]:
# prepare submission dataframe: the test ids next to their predicted labels
sub = test[['id']].assign(label=preds_test)

# write predictions to a CSV file (without the index column)
sub.to_csv("sub_lreg.csv", index=False)