# Exploratory Data Analysis (EDA)
### Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import Counter

plt.style.use('ggplot')
stop=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset file names used by the cells below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Read the competition's training and test CSV files into DataFrames.
tweet = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
print('tweet.shape: ', tweet.shape)
print('test.shape: ', test.shape)
# The last expression is rendered as the cell output: first rows of the training frame.
tweet.head()

In [None]:
# Print column names, non-null counts and dtypes for both frames.
tweet.info()
test.info()

### Class Distribution

In [None]:
# Count the tweets per target class and draw the counts as bars.
# seaborn 0.12+ only accepts `x`/`y` as keyword arguments, so they are named
# explicitly instead of being passed positionally.
x = tweet.target.value_counts()
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

In [None]:
# Another way to do count plot in Seaborn
# sns.set changes the global seaborn theme, so it also affects the plots drawn
# by later cells.
sns.set(style='darkgrid')
# countplot counts the rows per `target` value itself; no value_counts needed.
sns.countplot(x='target', data=tweet)

The two classes have a roughly similar number of records.

### Number of characters, words, length

In [None]:
# Side-by-side histograms of tweet length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
# Left panel: character count of every tweet with target == 1.
tweet_len = tweet[tweet['target']==1]['text'].str.len()
ax1.hist(tweet_len, color='red')
ax1.set_title('disaster tweets')
# Right panel: same measure for target == 0 (tweet_len is rebound here).
tweet_len = tweet[tweet['target']==0]['text'].str.len()
ax2.hist(tweet_len, color='green')
ax2.set_title('Not disaster tweets')
fig.suptitle('Characters in tweets')
plt.show()

Around 100-150 characters per tweet

In [None]:
# Side-by-side histograms of the number of whitespace-separated words per tweet.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
# Left panel: word count of every tweet with target == 1.
tweet_len = tweet[tweet['target']==1]['text'].str.split().map(lambda x: len(x))
ax1.hist(tweet_len, color='red')
ax1.set_title('disaster tweets')
# Right panel: same measure for target == 0 (tweet_len is rebound here).
tweet_len = tweet[tweet['target']==0]['text'].str.split().map(lambda x: len(x))
ax2.hist(tweet_len, color='green')
ax2.set_title('Not disaster tweets')
fig.suptitle('Words in tweets')
plt.show()

Around 10-20 words per tweet

In [None]:
# Distribution of the mean word length (in characters) per tweet, one panel per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed seaborn version.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
# For each disaster tweet build the list of its word lengths ...
word = tweet[tweet['target']==1]['text'].str.split().map(lambda x: [len(i) for i in x])
# ... then plot the per-tweet mean of those lengths.
sns.distplot(word.map(lambda x: np.mean(x)), ax=ax1, color='red')
ax1.set_title('disaster tweets')
# Same computation for non-disaster tweets (word is rebound here).
word = tweet[tweet['target']==0]['text'].str.split().map(lambda x: [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)), ax=ax2, color='green')
ax2.set_title('Not disaster tweets')
fig.suptitle('Average word lengths in each tweet')

* Disaster - around 7 characters per word
* Not disaster - around 5 characters per word

### Common Stop Words

In [None]:
def create_corpus(target):
    """Collect every whitespace-separated token of the tweets in one class.

    Args:
        target: Value of the ``target`` column selecting the tweets to use.

    Returns:
        A flat list of tokens, in tweet order, taken from the ``text`` column
        of the global ``tweet`` DataFrame.
    """
    class_texts = tweet.loc[tweet['target'] == target, 'text']
    return [token for tokens in class_texts.str.split() for token in tokens]

In [None]:
# Analyze Not disaster tweets
# Tokens of all tweets with target == 0.
corpus = create_corpus(0)

# Count how often each English stop word occurs (exact, case-sensitive match
# against the `stop` set built in the import cell).
dic = defaultdict(int)
for word in corpus:
    if word in stop:
        dic[word] += 1

# Keep the ten most frequent stop words.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

# Unzip (word, count) pairs into bar labels and bar heights.
x, y = zip(*top)
plt.bar(x, y)

In [None]:
# Analyze disaster tweets
# Tokens of all tweets with target == 1.
corpus = create_corpus(1)

# Count how often each English stop word occurs (exact, case-sensitive match
# against the `stop` set built in the import cell).
dic = defaultdict(int)
for word in corpus:
    if word in stop:
        dic[word] += 1

# Keep the ten most frequent stop words.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

# Unzip (word, count) pairs into bar labels and bar heights.
x, y = zip(*top)
plt.bar(x, y)

Not much difference between Disaster and Not disaster

### Punctuation

In [None]:
# Analyze Not disaster tweets
corpus = create_corpus(0)

# Compare against a set of single punctuation characters. Testing membership in
# the raw string would be a substring test, which also counts multi-character
# tokens such as '()' or ':;' as punctuation.
special = set(string.punctuation)
dic = defaultdict(int)
for word in corpus:
    if word in special:
        dic[word] += 1

plt.figure(figsize=(10, 5))
# Only unpack when something was counted: zip(*...) of an empty dict yields
# nothing and the two-name unpacking would raise ValueError.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y)

In [None]:
# Analyze disaster tweets
corpus = create_corpus(1)

# Compare against a set of single punctuation characters. Testing membership in
# the raw string would be a substring test, which also counts multi-character
# tokens such as '()' or ':;' as punctuation.
special = set(string.punctuation)
dic = defaultdict(int)
for word in corpus:
    if word in special:
        dic[word] += 1

plt.figure(figsize=(10, 5))
# Only unpack when something was counted: zip(*...) of an empty dict yields
# nothing and the two-name unpacking would raise ValueError.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y)

### Common words not in stopwords

In [None]:
# NOTE: `corpus` is not rebuilt in this cell; it is whatever the previous cell
# left behind (the disaster-tweet tokens from create_corpus(1) when run in order).
counter = Counter(corpus)
most = counter.most_common()
x, y = [], []
# Only the 40 most frequent tokens are inspected; stop words among them are
# skipped, so the chart shows fewer than 40 bars.
for word, count in most[:40]:
    if word not in stop:
        x.append(word)
        y.append(count)

# Horizontal bars: counts on the x axis, words on the y axis.
sns.barplot(x=y, y=x)

### Ngram analysis

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Rank the word bigrams of a collection of texts by total frequency.

    Args:
        corpus: Iterable of raw text documents.
        n: Number of top bigrams to return; ``None`` returns all of them.

    Returns:
        A list of ``(bigram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    # Column totals of the document-term matrix: one count per bigram.
    totals = vectorizer.fit(corpus).transform(corpus).sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# Ten most frequent bigrams over all training tweets (both classes together).
top_tweet_bigrams = get_top_tweet_bigrams(tweet['text'])[:10]
top_tweet_bigrams

In [None]:
# Horizontal bar chart of the top bigrams: counts on x, bigram text on y.
plt.figure(figsize=(10,5))
x, y = map(list, zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)

### Hashtag Analysis

In [None]:
def find_hashtags(text):
    """Extract the hashtags of a text as one comma-separated, lower-cased string.

    Args:
        text: The text to scan for ``#word`` patterns.

    Returns:
        The hashtags without their leading ``#``, lower-cased and joined by
        commas, or ``None`` when the text contains no hashtag.
    """
    tags = [tag[1:].lower() for tag in re.findall(r'#\w+', text)]
    if not tags:
        return None
    return ','.join(tags)

def create_hashtag_corpus(target):
    """Collect every hashtag used in the tweets of one class.

    Tweets without hashtags are dropped instead of being replaced by a ``'no'``
    placeholder, so a genuine ``#no`` hashtag is kept in the result.

    Args:
        target: Value of the ``target`` column selecting the tweets to use.

    Returns:
        A flat list of lower-cased hashtags (without ``#``), with repeats, taken
        from the ``text`` column of the global ``tweet`` DataFrame.
    """
    # find_hashtags returns None for tweets without hashtags; dropna removes them.
    hashtags = tweet[tweet['target']==target]['text'].apply(find_hashtags).dropna()
    print('The number of tweets having hashtag(s): ', len(hashtags))
    return [tag for tags in hashtags for tag in tags.split(',')]

In [None]:
# Hashtags of the tweets with target == 0, counted per hashtag.
corpus_ht = create_hashtag_corpus(0)
dic = defaultdict(int)
for word in corpus_ht:
    dic[word] += 1

# Plot the 20 most frequent hashtags as horizontal bars.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:20]
plt.figure(figsize=(10,6))
x, y = map(list, zip(*top))
sns.barplot(x=y, y=x)

In [None]:
# Hashtags of the tweets with target == 1, counted per hashtag.
corpus_ht = create_hashtag_corpus(1)
dic = defaultdict(int)
for word in corpus_ht:
    dic[word] += 1

# Plot the 20 most frequent hashtags as horizontal bars.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:20]
plt.figure(figsize=(10,6))
x, y = map(list, zip(*top))
sns.barplot(x=y, y=x)

### Keyword

In [None]:
#Remove the encoded space character
tweet['keyword'] = tweet['keyword'].map(lambda s: s.replace('%20', ' ') if isinstance(s, str) else s)

kw_unique  = {kw for kw in tweet['keyword'].values if isinstance(kw, str)}
kw_total = len(tweet) - len(tweet[tweet["keyword"].isna()])

print("Unique Keyword / Total: {} / {}".format(len(kw_unique), kw_total))
print("Tweets with no keyword: {}".format(len(tweet[tweet["keyword"].isna()])))

In [None]:
# Keyword values of the rows in each class.
kw_disaster = [kw for kw in tweet.loc[tweet.target == 1].keyword]
kw_not_disaster = [kw for kw in tweet.loc[tweet.target == 0].keyword]

# keyword -> occurrence count, per class and overall (value_counts skips NaN by default).
kw_disaster_cn = dict(pd.DataFrame(data={'x': kw_disaster}).x.value_counts())
kw_not_disaster_cn = dict(pd.DataFrame(data={'x': kw_not_disaster}).x.value_counts())
kw_all_cn =  dict(pd.DataFrame(data={'x': tweet.keyword.values}).x.value_counts())

# For the ten most frequent keywords overall, print the per-class counts
# (0 when the keyword never occurs in that class).
for keyword, _ in sorted(kw_all_cn.items(), key=lambda x: x[1], reverse=True)[:10]:
    print("> Keyword: {}".format(keyword))
    print("-- # in disaster tweets:     {}".format(kw_disaster_cn.get(keyword, 0)))
    print("-- # in not disaster tweets: {}".format(kw_not_disaster_cn.get(keyword, 0)))
    print('--------')

Some keywords appear in both classes, while others appear only in non-disaster tweets.
### Location

In [None]:
# Distinct location strings, and the number of rows that have a location at all.
loc_unique  = {loc for loc in tweet['location'].values if isinstance(loc, str)}
loc_total = len(tweet) - len(tweet[tweet["location"].isna()])

print("Unique Location / Total: {} / {}".format(len(loc_unique), loc_total))
print("Tweets with no Location: {}".format(len(tweet[tweet["location"].isna()])))

In [None]:
# Location values of the rows in each class.
loc_disaster = [loc for loc in tweet.loc[tweet.target == 1].location]
loc_not_disaster = [loc for loc in tweet.loc[tweet.target == 0].location]

# location -> occurrence count, per class and overall (value_counts skips NaN by default).
loc_disaster_cn = dict(pd.DataFrame(data={'x': loc_disaster}).x.value_counts())
loc_not_disaster_cn = dict(pd.DataFrame(data={'x': loc_not_disaster}).x.value_counts())
loc_all_cn =  dict(pd.DataFrame(data={'x': tweet.location.values}).x.value_counts())

# For the ten most frequent locations overall, print the per-class counts
# (0 when the location never occurs in that class).
for location, _ in sorted(loc_all_cn.items(), key=lambda x: x[1], reverse=True)[:10]:
    print("> Location: {}".format(location))
    print("-- # in disaster tweets:     {}".format(loc_disaster_cn.get(location, 0)))
    print("-- # in not disaster tweets: {}".format(loc_not_disaster_cn.get(location, 0)))
    print('--------')

We may use this, but the data is too sparse.

# Data Cleaning
* Remove URLs
* Remove HTML Tags
* Remove Emoji
* Remove Punctuations

In [None]:
# Stack train and test so the cleaning steps below are applied to both at once.
# concat is called without ignore_index, so each frame keeps its own row labels;
# later cells split `df` back apart by position, not by label.
df = pd.concat([tweet, test])
print(df.shape)
df

### Removing URLs

In [None]:
# Sample string containing a URL, used to demonstrate remove_URL below.
example="This is a test message :https://www.abc.com/test/sample-project"

In [None]:
def remove_URL(text):
    """Delete URLs from a string.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched URL removed.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Show the cleaned sample string as the cell output.
remove_URL(example)

In [None]:
# Strip URLs from every tweet in the combined train/test frame.
df['text'] = df['text'].apply(remove_URL)

### Removing HTML tags

In [None]:
# Sample HTML fragment, used to demonstrate remove_html below.
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    """Delete HTML tags from a string, keeping the text between them.

    A tag is the shortest run from ``<`` to the next ``>`` on the same line
    (``.`` does not match newlines here).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched tag removed.
    """
    return re.sub(r'<.*?>', '', text)

# Print the sample with its tags stripped (print keeps the line breaks readable).
print(remove_html(example))

In [None]:
# Strip HTML tags from every tweet in the combined train/test frame.
df['text'] = df['text'].apply(remove_html)

### Removing Emojis

In [None]:
# Sample string containing emoji, used to demonstrate remove_emoji below.
example = "Omg another Earthquake 😔😔"

In [None]:
def remove_emoji(text):
    """Delete emoji and other characters in a fixed set of Unicode ranges.

    Note that the last range (U+24C2 to U+1F251) is very wide: besides emoji it
    also covers, for example, CJK ideographs, which are removed as well.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of characters from the ranges removed.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", '', text, flags=re.UNICODE)

# Show the cleaned sample string as the cell output.
remove_emoji(example)

In [None]:
# Strip emoji from every tweet in the combined train/test frame.
df['text'] = df['text'].apply(remove_emoji)

### Removing punctuations

In [None]:
# Sample string containing punctuation, used to demonstrate remove_punct below.
example="I am a #king"

In [None]:
def remove_punct(text):
    """Delete every ASCII punctuation character from a string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any character listed in ``string.punctuation``.
    """
    # A translation table mapping a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Print the sample with its punctuation stripped.
print(remove_punct(example))

In [None]:
# Strip punctuation from every tweet in the combined train/test frame.
df['text'] = df['text'].apply(remove_punct)

# Model, Train and Predict
## Utility

In [None]:
def test_save(model, test_df, name_pre='test'):
    sample_sub = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
    y_pre = model.predict(test_df.text)
    y_pre = np.round(y_pre).astype(int).reshape(3263)
    sub = pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
    sub_name = name_pre + "_submission.csv"
    sub.to_csv(sub_name, index=False)

## A) tfidf + Logistic Regression as a baseline model

In [None]:
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def build_tfidf_logreg(df, validation=False):
    """Fit a TF-IDF + logistic-regression pipeline on the tweet text.

    Args:
        df: DataFrame with a ``text`` column (features) and a ``target`` column
            (labels).
        validation: When True, hold out 20% of the rows (fixed
            ``random_state=7``), fit on the remaining 80% and print training
            and validation accuracy. When False, fit on all rows.

    Returns:
        The fitted pipeline. In validation mode it has been trained on the 80%
        training split only.
    """
    x_train = df['text']
    y_train = df['target']
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word')),
        ('clf', LogisticRegression())
        ])

    if validation:
        x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
            x_train.values, y_train.values, test_size=0.2, random_state=7)
        clf.fit(x_train, y_train)
        predicted_train = clf.predict(x_train)
        predicted_valid = clf.predict(x_valid)
        print("Training Accuracy: ", np.mean(predicted_train == y_train))
        print("Validation Accuracy: ", np.mean(predicted_valid == y_valid))
    else:
        clf.fit(x_train, y_train)

    return clf

# Split the cleaned frame back apart by position: `df` was built as
# concat([tweet, test]), so its first tweet.shape[0] rows are the training rows.
train_df = df[:tweet.shape[0]]
test_df = df[tweet.shape[0]:]

# Fit the baseline on all training rows (no hold-out split).
model = build_tfidf_logreg(train_df, validation=False)

# Write the baseline predictions to a_tfidf_submission.csv.
test_save(model, test_df, "a_tfidf")

## B) LSTM
### GloVe Word Embedding

In [None]:
def create_corpus(df):
    """Tokenize every tweet into lower-cased alphabetic, non-stop-word tokens.

    NOTE: this redefines the ``create_corpus(target)`` helper from the EDA
    section; once this cell has run, the EDA cells that call
    ``create_corpus(0)`` / ``create_corpus(1)`` no longer work without
    re-running the earlier definition.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A list with one token list per row of ``df``, in row order.
    """
    corpus = []
    for text in tqdm(df['text']):
        # The stop-word set is lower-case, so compare the lower-cased token;
        # otherwise capitalised stop words such as "The" slip through.
        words = [
            word.lower()
            for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop
        ]
        corpus.append(words)
    return corpus

In [None]:
# Token lists for every row of the combined train/test frame (rebinds `corpus`).
corpus = create_corpus(df)

In [None]:
# Show which GloVe files are available in the attached dataset.
os.listdir('../input/glove-global-vectors-for-word-representation/')

In [None]:
# Load a Lookup table
embedding_dict={}
with open('../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()


In [None]:
# Tokenize corpus
MAX_LEN = 50
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)
print(sequences)
tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')
print(tweet_pad)

In [None]:
# Mapping word -> integer id learned by the tokenizer.
word_index = tokenizer_obj.word_index
print('Number of unique words:', len(word_index))

In [None]:
# Embed words
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, 200))

for word, i in tqdm(word_index.items()):
    if i > num_words:
        continue
        
    embed_vec = embedding_dict.get(word)
    if embed_vec is not None:
        embedding_matrix[i] = embed_vec

### Model and Test

In [None]:
# NOTE: `model` previously held the TF-IDF pipeline; it is rebound to the Keras model here.
model = Sequential()

# Embedding layer initialised from the GloVe matrix and frozen (trainable=False).
embedding = Embedding(num_words, 200, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_LEN, trainable=False)
model.add(embedding)
model.add(SpatialDropout1D(0.2))
# Two stacked LSTMs: the first returns the full sequence so the second can consume it.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: output is a value in (0, 1) for the binary target.
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

In [None]:
# Split the padded sequences back into train and test rows by position.
# NOTE: `test` previously held the test DataFrame; from here on it is the
# padded NumPy array of the test tweets.
train = tweet_pad[:tweet.shape[0]]
test = tweet_pad[tweet.shape[0]:]

# Hold out 15% of the training rows for validation. The split is seeded (same
# seed as the TF-IDF baseline) so it is identical on every run.
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15,random_state=7)

print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Train for 10 epochs with batches of 4, reporting loss/accuracy on the held-out
# split after each epoch; `history` keeps the returned training history object.
history=model.fit(X_train,y_train,batch_size=4,epochs=10,validation_data=(X_test,y_test),verbose=2)

In [None]:
# Predict on the padded test sequences and write b_LSTM_submission.csv.
test_save(model, test, "b_LSTM")