# Preprocessing

In [None]:
import numpy as np
import pandas as pd

# Read the competition's train / test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

## Overview

In [None]:
# Peek at the first five rows of the training set.
df_train.head(n=5)

In [None]:
# (number of rows, number of columns) of the training set
df_train.shape

In [None]:
# Number of training rows per target value.
df_train["target"].value_counts()

The dataset is imbalanced, so resampling may be worth trying.

In [None]:
# Example of a tweet labelled as a disaster (target == 1)
df_train.loc[df_train["target"] == 1, "text"].iloc[0]

In [None]:
# Example of a tweet from the other class (target == 0)
df_train.loc[df_train["target"] == 0, "text"].iloc[0]

### Missing values

In [None]:
# Count of missing values in each column of the training set.
df_train.isnull().sum()

In [None]:
# Frequency of each distinct keyword (missing values are not counted).
df_train["keyword"].value_counts()

In [None]:
# Frequency of each distinct location (missing values are not counted).
df_train["location"].value_counts()

## Transformation of the data

### Preprocessing

In [None]:
import re

def clean_text(text):
    """Normalise a raw string before vectorisation.

    Steps, in order: lower-case the text, turn URL-encoded spaces (``%20``)
    into real spaces, drop every ASCII digit, and drop ``#`` characters
    (the word following the ``#`` is kept).

    Args:
        text (str): the raw text.

    Returns:
        str: the cleaned text.
    """
    text = text.lower()
    # Decode "%20" *before* stripping digits: once "2" and "0" are removed only
    # a bare "%" is left and the pattern can never match. A space is used as
    # the replacement so the two words it separates are not glued together.
    text = text.replace("%20", " ")
    text = re.sub('[0-9]', '', text)
    text = text.replace('#', '')
    return text

# Clean the tweet text of both splits with the same function.
df_train["text"] = df_train["text"].map(clean_text)
df_test["text"] = df_test["text"].map(clean_text)

## Ridge classifier

### First attempt with just the text

In [None]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, metrics

# Bag-of-words features: one column per vocabulary token, one count per tweet.
# The vectorizer is built with its default settings (no stop_words argument is passed).
count_vectorizer = feature_extraction.text.CountVectorizer()
y_train = df_train["target"]
X_train = count_vectorizer.fit_transform(df_train["text"])
X_test = count_vectorizer.transform(df_test["text"])

In [None]:
# 3-fold cross-validation of a ridge classifier on the text features.
# Scored with F1 (not recall) so the result is comparable with the other
# attempts in this notebook, which all use scoring="f1".
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring="f1")
print("scores : ", scores)

In [None]:
# Fit on the full training set and predict the test set once.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Write the predictions into the sample-submission template.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = y_pred  # reuse y_pred instead of predicting a second time
sample_submission.to_csv("submission.csv", index=False)

### Second attempt with just the keyword

In [None]:
# Replace missing keyword / location values with explicit placeholder tokens.
# (First approach; a possible refinement is to assign the missing keywords ourselves.)
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_train["location"] = df_train["location"].fillna("unknown_location")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
df_test["location"] = df_test["location"].fillna("unknown_location")

In [None]:
# Re-fit the shared vectorizer on the keyword column only.
X_train = count_vectorizer.fit_transform(df_train["keyword"])
X_test = count_vectorizer.transform(df_test["keyword"])

In [None]:
# 3-fold cross-validated F1 of a ridge classifier on the keyword features.
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring="f1", cv=3
)
print("scores : ", scores)

### Third attempt with just the location

In [None]:
# Re-fit the shared vectorizer on the location column only.
X_train = count_vectorizer.fit_transform(df_train["location"])
X_test = count_vectorizer.transform(df_test["location"])

In [None]:
# 3-fold cross-validated F1 of a ridge classifier on the location features.
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring="f1", cv=3
)
print("scores : ", scores)

### Fourth attempt with all the information

In [None]:
# Join text, keyword and location into one space-separated string per row.
df_train["complete_information"] = df_train["text"].str.cat(
    [df_train["keyword"], df_train["location"]], sep=" "
)
df_test["complete_information"] = df_test["text"].str.cat(
    [df_test["keyword"], df_test["location"]], sep=" "
)

In [None]:
# Re-fit the shared vectorizer on the concatenated text + keyword + location column.
X_train = count_vectorizer.fit_transform(df_train["complete_information"])
X_test = count_vectorizer.transform(df_test["complete_information"])

In [None]:
# 3-fold cross-validated F1 of a ridge classifier on the combined features.
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring="f1", cv=3
)
print("scores : ", scores)

In [None]:
# Train on every training row, then store the test predictions in the submission template.
test_predictions = clf.fit(X_train, y_train).predict(X_test)
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = test_predictions
sample_submission.to_csv("sub_RidgeClassifier_CompleteInfo_imbalanced.csv", index=False)

In [None]:
# Same evaluation, but with class weights that compensate for the class imbalance.
clf = linear_model.RidgeClassifier(class_weight="balanced")
scores = model_selection.cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring="f1", cv=3
)
print("scores : ", scores)

In [None]:
# Train the class-weighted classifier on every training row and export its test predictions.
test_predictions = clf.fit(X_train, y_train).predict(X_test)
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = test_predictions
sample_submission.to_csv("sub_RidgeClassifier_CompleteInfo_balanced.csv", index=False)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features computed on the tweet text.
train_corpus = df_train.text.values
test_corpus = df_test.text.values

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

# Build the classifier here instead of reusing whatever `clf` an earlier cell
# left behind; class_weight="balanced" matches the classifier configuration
# most recently used in the Ridge section.
clf = linear_model.RidgeClassifier(class_weight="balanced")
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring="f1")

print("scores :", scores)

In [None]:
# Train on all TF-IDF training rows, then export the test predictions.
y_pred = clf.fit(X_train, y_train).predict(X_test)

sample_submission["target"] = y_pred
sample_submission.to_csv("sample_submission_tfidf.csv", index=False)

## LSA 

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# LSA: how much variance do truncated SVDs of the count matrix retain?
X_sparse = count_vectorizer.fit_transform(df_train.text)
nb_words = X_sparse.shape[1]
# Component counts equal to 1%, 2%, 5% and 10% of the vocabulary size
# (integer arithmetic, so no float rounding can shift a count by one).
l_nb_comp = [nb_words * pct // 100 for pct in [1, 2, 5, 10]]
l_explained_var = []
for nb_comp in l_nb_comp:
    svd = TruncatedSVD(n_components=nb_comp, n_iter=7, random_state=42)
    # Only the fitted explained-variance ratios are needed here, so the
    # projected matrix is not kept and the global X_train is left untouched.
    svd.fit(X_sparse)
    explained_var = svd.explained_variance_ratio_.sum()
    l_explained_var.append(np.round(explained_var, 2))

print(l_nb_comp)
print(l_explained_var)

## LSTM

In [None]:
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import one_hot

# Tokenise every tweet (train + test) to measure the longest tweet and the vocabulary size.
corpus = pd.concat([df_train, df_test], axis=0).text.values
train_corpus = df_train.text.values
test_corpus = df_test.text.values

tokenized_corpus = [word_tokenize(sent) for sent in corpus]

# Length, in tokens, of the longest tweet (0 when the corpus is empty).
max_nbwords_sent = max((len(tokens) for tokens in tokenized_corpus), default=0)
print(max_nbwords_sent)

# Flat list of every token, then the distinct tokens and how many there are.
all_words = [word for tokens in tokenized_corpus for word in tokens]
unique_words = set(all_words)
vocab_length = len(unique_words)


In [None]:
# Fourth root of the vocabulary size, truncated to an integer
# (the same expression is used as the embedding output dimension in get_model).
int(vocab_length ** 0.25)

In [None]:
# Encode each tweet as a list of integer word indices with keras' one_hot,
# using vocab_length as the size of the index space.
train_embedded_sentences = [one_hot(tweet, vocab_length) for tweet in train_corpus]
test_embedded_sentences = [one_hot(tweet, vocab_length) for tweet in test_corpus]

In [None]:
from keras.preprocessing import sequence

# Bring every encoded tweet to the common length max_nbwords_sent.
y_train = df_train["target"]
X_train, X_test = (
    sequence.pad_sequences(encoded, maxlen=max_nbwords_sent)
    for encoded in (train_embedded_sentences, test_embedded_sentences)
)

In [None]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Embedding, LSTM 
from keras.datasets import imdb 

def get_model():
    """Build and compile the LSTM classifier.

    Architecture: Embedding -> Dropout(0.2) -> LSTM(100) -> Dense(250, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid). The embedding reads the module-level
    ``vocab_length`` (input dimension; its fourth root is the output dimension)
    and ``max_nbwords_sent`` (input length).

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer and the accuracy metric.
    """
    embedding_dim = int(vocab_length**(1/4))
    layers = [
        Embedding(input_dim=vocab_length, output_dim=embedding_dim, input_length=max_nbwords_sent),
        Dropout(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(250, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
from sklearn.model_selection import KFold

# 3-fold cross-validation of the LSTM; a fresh model is built for every fold.
kf = KFold(n_splits=3, random_state=50, shuffle=True)
l_score = []
for training_ids, test_ids in kf.split(X_train):
    # KFold yields positional indices. y_train is a pandas Series, so select
    # with .iloc: plain y_train[ids] is label-based and is only correct while
    # the index happens to be the default 0..n-1 range.
    y_fold_train = y_train.iloc[training_ids]
    y_fold_val = y_train.iloc[test_ids]
    mod = get_model()
    mod.fit(
        X_train[training_ids],
        y_fold_train,
        batch_size=64,
        epochs=5,
        validation_data=(X_train[test_ids], y_fold_val),
    )
    # evaluate() returns the loss followed by the compiled metrics for the held-out fold.
    score = mod.evaluate(X_train[test_ids], y_fold_val)
    l_score.append(score)
print(l_score)
    
    

After 2 epochs the model starts to overfit, as the val_accuracy decreases.
We therefore retrain the model on the whole training set for only 2 epochs before generating its predictions.

In [None]:
# Final LSTM: train on the full training set for 2 epochs, then predict the test set.
model = get_model()
model.fit(X_train, y_train, batch_size=64, epochs=2)
y_pred = model.predict(X_test)

# Re-read the template so this cell does not depend on a `sample_submission`
# object left over (and already modified) by an earlier cell.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
# Round the sigmoid outputs to 0/1 labels and flatten them to a 1-D vector
# so each submission row receives exactly one value.
sample_submission["target"] = np.round(y_pred, 0).astype(int).ravel()
sample_submission.to_csv("sample_submission_lstm_2pochs.csv", index=False)