# Détection de catastrophes dans des tweets

Compétition Kaggle [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)

## Liste TODO

- [ ] faire un prétraitement du texte : supprimer les stop-words, la ponctuation, ...
- [ ] extraire les hashtags comme des features à part

## Dépendances

In [2]:
import pandas as pd
import numpy as np
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

## Import des données

In [5]:
# Load the competition's train and test CSV files; both paths are relative
# to the current working directory.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

## Modèle baseline 

In [9]:
# Baseline: bag-of-words counts fed to a ridge classifier, scored with
# 3-fold cross-validated F1.
count_vectorizer = feature_extraction.text.CountVectorizer()
# The vocabulary is learned on the training tweets only; the test tweets are
# merely projected onto that vocabulary.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df["text"])
test_vectors = count_vectorizer.transform(raw_documents=test_df["text"])

clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
scores

array([0.59421842, 0.56455572, 0.64082434])

## Baseline avec TF-IDF et tokenisation

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.base import BaseEstimator, TransformerMixin

class UnSparse(BaseEstimator, TransformerMixin):
    """Pipeline step that densifies its input by calling ``X.toarray()``.

    The step is stateless: fitting learns nothing from the data.
    """

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X.toarray()``; ``X`` must expose that method."""
        dense = X.toarray()
        return dense

# TF-IDF features + ridge classifier, wrapped in a pipeline so the whole chain
# is handed to cross-validation as a single estimator.
clf = Pipeline(steps=[
    ('vectorize', TfidfVectorizer()),
    # Disabled experiment: densify, then standardise the TF-IDF features.
    # ('un_sparse', UnSparse()),
    # ('scaler', StandardScaler()),
    ('model_clf', RidgeClassifier()),
])

corpus = train_df['text']

# 10-fold cross-validated F1 on the raw tweet text.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=corpus,
    y=train_df["target"],
    scoring="f1",
    cv=10,
)
print(scores.mean())
scores

0.613861444610996


array([0.6514658 , 0.57735247, 0.53373313, 0.52262774, 0.59643917,
       0.64083458, 0.61806656, 0.5625    , 0.71671388, 0.71888112])

## Baseline avec prétraitement du texte

In [86]:
import re
from nltk.stem.snowball import EnglishStemmer
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords

# One-time setup: uncomment to download the NLTK stop-word corpus.
# nltk.download('stopwords')

# Default stop-word list used by CleanupText: NLTK's English list followed by
# a few extra very common words.
sw = [*stopwords.words('english'), 'the', 'a', 'and', 'is', 'be', 'will']

# One-time setup: uncomment to download the NLTK 'punkt' resource.
# nltk.download('punkt')

class CleanupText(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw text documents.

    Every document has URLs and HTML tags stripped and is lower-cased.
    Punctuation removal, stop-word removal and whitespace normalisation are
    optional and driven by constructor flags, so they can be tuned like any
    other pipeline hyper-parameter.

    Parameters
    ----------
    stop_words : list of str, optional
        Words dropped when ``remove_stop_words`` is true. Falls back to the
        module-level ``sw`` list when omitted or empty.
    remove_stop_words : bool, default=True
        Drop every whitespace-separated token found in ``stop_words``.
    remove_punctuations : bool, default=True
        Delete the punctuation characters from each document.
    clean_whitespaces : bool, default=True
        Collapse whitespace runs to one space and trim both ends.
    """

    # Characters deleted when ``remove_punctuations`` is enabled.
    _DEFAULT_PUNCTUATIONS = r'''!()-[]{};:'"\,<>./?@#$%^&*_~|'''

    def __init__(self, 
                 stop_words=None, 
                 remove_stop_words=True, 
                 remove_punctuations=True, 
                 clean_whitespaces=True):
        self.stop_words = stop_words or sw
        self.remove_stop_words = remove_stop_words
        self.remove_punctuations = remove_punctuations
        self.clean_whitespaces = clean_whitespaces
    
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    
    def transform(self, X, y=None):
        """Clean every document of ``X``.

        Args:
            X: iterable of strings.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            A NumPy array holding one cleaned string per input document.
        """
        # Build the lookup structures once per call rather than once per
        # document.
        punct_table = str.maketrans('', '', self._DEFAULT_PUNCTUATIONS)
        stop_set = frozenset(self.stop_words)
        return np.array([self._clean(txt, punct_table, stop_set) for txt in X])

    def clean_text(
        self,
        string: str, 
        punctuations: str = _DEFAULT_PUNCTUATIONS) -> str:
        """Clean a single document.

        Args:
            string: the text to clean.
            punctuations: characters deleted when ``remove_punctuations`` is
                enabled.

        Returns:
            The cleaned, lower-cased text.
        """
        punct_table = str.maketrans('', '', punctuations)
        return self._clean(string, punct_table, frozenset(self.stop_words))

    def _clean(self, string, punct_table, stop_set):
        """Apply the cleaning steps using pre-built lookup structures."""
        # Strip URLs first, while the '/' and '.' that delimit them are still
        # present.
        string = re.sub(r'https?://\S+|www\.\S+', '', string)

        # Strip HTML elements.
        string = re.sub(r'<.*?>', '', string)

        # Delete all punctuation in a single pass instead of one
        # ``str.replace`` scan per character of the document.
        if self.remove_punctuations:
            string = string.translate(punct_table)

        string = string.lower()

        # NOTE: punctuation is stripped before this step, so a stop word that
        # contains a punctuation character (e.g. an apostrophe) can no longer
        # match when ``remove_punctuations`` is enabled.
        if self.remove_stop_words:
            string = ' '.join(
                word for word in string.split() if word not in stop_set
            )

        if self.clean_whitespaces:
            string = re.sub(r'\s+', ' ', string).strip()

        return string

In [87]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the three CleanupText switches placed in front of the TF-IDF +
# ridge classifier, scoring every combination with 10-fold cross-validated F1.
clf = Pipeline(steps=[
    ('text_clean', CleanupText()),
    ('vectorize', TfidfVectorizer()),
    ('model_clf', RidgeClassifier()),
])

param_grid = [
    {
        'text_clean__remove_stop_words': [True, False],
        'text_clean__remove_punctuations': [True, False],
        'text_clean__clean_whitespaces': [True, False],
    },
]

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1',
    cv=10,
)
grid_search.fit(X=train_df['text'], y=train_df["target"])
print(grid_search.best_params_)
grid_search.best_score_

{'text_clean__clean_whitespaces': True, 'text_clean__remove_punctuations': False, 'text_clean__remove_stop_words': False}


0.5908561245751591

## Random Forest

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# TF-IDF features + random forest, tuned over the number of trees and the
# maximum number of leaves per tree with 3-fold cross-validated F1.
clf = Pipeline([
    ('vectorize', TfidfVectorizer()),
    # A fixed seed makes the forest, and therefore the CV scores and the
    # selected parameters, reproducible from one run to the next.
    ('model_clf', RandomForestClassifier(n_jobs=-1, random_state=42)),
])

param_grid = [
    {
        'model_clf__n_estimators': [100, 200, 500],
        'model_clf__max_leaf_nodes': [64, None],
    },
]
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='f1')
grid_search.fit(train_df['text'], train_df["target"])
grid_search.best_params_

{'model_clf__max_leaf_nodes': None, 'model_clf__n_estimators': 500}

In [66]:
# Display the best cross-validated score (scoring='f1') found by the
# random-forest grid search above.
grid_search.best_score_

0.5680143763271022

## Word embedding

from [this Medium post](https://medium.com/analytics-vidhya/text-classification-using-word-embeddings-and-deep-learning-in-python-classifying-tweets-from-6fe644fcfc81)