<a href="https://colab.research.google.com/github/tamtemtomm/AnswerWithPaper/blob/master/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Dependencies

In [None]:
# @title <p>Essential Import
import os, shutil, json
from PIL import Image
from zipfile import ZipFile
import matplotlib.pyplot as plt
import numpy as np, pandas as pd, random as rd
import warnings
warnings.filterwarnings("ignore")

In [None]:
# @title <p> Essential NLP Import
import re, spacy, nltk, numpy as np
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from nltk.stem import WordNetLemmatizer
# from nltk.stem.lancaster import LancasterStemmer
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

nltk_stw_en = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# @title <p>Sklearn Essential Import
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, recall_score, precision_score, accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

### Kaggle Authentication

In [None]:
!pip install -q kaggle

In [None]:
# @title <p> Import kaggle API
# from google.colab import files
# files.upload()

# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle
# ! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# @title <p>Import kaggle API from google drive
from google.colab import drive
drive.mount('/content/gdrive')

! mkdir ~/.kaggle
! cp '/content/gdrive/MyDrive/Colab Notebooks/kaggle.json' ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

drive.flush_and_unmount()

Mounted at /content/gdrive


### Inspect Dataset

In [None]:
# @title <p>Download Dataset
!kaggle competitions download -c nlp-getting-started
!unzip nlp-getting-started.zip &> /dev/null
!rm nlp-getting-started.zip

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 74.9MB/s]


In [None]:
# @title <p> Get train data
# Labelled training split, read from the working directory.
train: pd.DataFrame = pd.read_csv("train.csv")
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# @title <p> Get test data
# Unlabelled split that the submission is predicted for.
test: pd.DataFrame = pd.read_csv("test.csv")
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# @title <p> Get sample_submission data
# Template showing the id/target layout of a submission file.
sample_submission: pd.DataFrame = pd.read_csv("sample_submission.csv")
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


## Explore Datasets

### Vibe Check

In [None]:
# @title <p> Dataset info
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [None]:
# @title <p> Dataset describe
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [None]:
# @title <p> Check null values percentage
# Report the absolute and relative number of missing values for every column except the id.
n_rows = len(train)
for col in train.columns:
  if col == 'id':
    continue
  n_null = int(train[col].isna().sum())
  print(f'Null values for {col} : {n_null} ({n_null/n_rows*100}%)')

Null values for keyword : 61 (0.8012610009194797%)
Null values for location : 2533 (33.27203467752528%)
Null values for text : 0 (0.0%)
Null values for target : 0 (0.0%)


## Preprocessing

In [None]:
# @title <p> Initialize preprocess constants
# Tweet-aware tokenizer, configured to strip @handles and reduce lengthened words.
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
# Rebuild from the NLTK list instead of extending the previous value, so
# re-running this cell does not append another "oh" each time.
nltk_stw_en = stopwords.words('english') + ["oh"]
# A whitespace-separated token containing any of these substrings is dropped by
# preprocess_text: URLs, @mentions and the \x89 control character
# (presumably residue of mis-decoded text - confirm against the raw data).
link_starters = ['http', "@", "\x89"]
lemmatizer = WordNetLemmatizer()

In [None]:
# @title <p> Initialize preprocess text function

def preprocess_text(text: str) -> str:
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase the text; drop every whitespace-separated token
    that contains one of ``link_starters``; replace every character other than
    ASCII letters, apostrophes and spaces with a space; remove the words listed
    in ``nltk_stw_en``; re-tokenise with ``tweet_tokenizer``; lemmatise with
    ``lemmatizer`` and discard single-character tokens.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with tokens separated by single spaces (possibly empty).
    """
    # Sentence tokenize
    text = " ".join(nltk.sent_tokenize(text))

    # Casefold before looking for link markers so that "HTTP://..." is caught
    # as well; previously only lowercase "http" tokens were removed.
    text = text.lower()

    # Remove links / mentions in a single pass over the tokens. split() also
    # takes care of newlines and of leading/trailing whitespace.
    markers = [marker.lower() for marker in link_starters]
    text = " ".join(
        token for token in text.split()
        if not any(marker in token for marker in markers)
    )

    # Remove symbols
    text = re.sub(r"[^a-zA-Z' ]", ' ', text)

    # Remove repetitive space
    text = re.sub(' +', ' ', text).strip()

    # Remove stopwords (set lookup instead of scanning the list for every token)
    stop_words = set(nltk_stw_en)
    text = ' '.join(token for token in text.split() if token not in stop_words)

    # Tweet tokenize
    text = ' '.join(tweet_tokenizer.tokenize(text))

    # Wordnet lemmatize, skipping one-character leftovers such as a lone apostrophe
    text = ' '.join(lemmatizer.lemmatize(token) for token in text.split() if len(token) > 1)

    return text

# Try the cleaner on the training tweets; entries that are not plain str are skipped.
texts = [
    preprocess_text(tweet)
    for tweet in train["text"]
    if type(tweet) is str
]
texts[:5]

['deed reason earthquake may allah forgive u',
 'forest fire near la ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 'people receive wildfire evacuation order california',
 'got sent photo ruby alaska smoke wildfire pours school']

In [None]:
# @title <p> Apply function into df
def preprocess_df(df: pd.DataFrame, test=False) -> pd.DataFrame:
  """Return the cleaned feature columns of a competition frame.

  The first column is always dropped; unless ``test`` is true the last column
  is dropped as well. Every string cell is passed through ``preprocess_text``
  and missing values are replaced by empty strings. The input frame is not
  modified.

  Args:
    df: Frame whose first column is an id and, for training data, whose last
      column is the target.
    test: True when ``df`` has no trailing target column.

  Returns:
    A new DataFrame containing only the cleaned feature columns.
  """
  # Copy AFTER slicing so the assignments below hit an independent frame
  # instead of a slice (the old per-cell iloc writes targeted a slice of a copy).
  features = (df.iloc[:, 1:] if test else df.iloc[:, 1:-1]).copy()

  def clean_cell(value):
    # Non-strings (e.g. NaN for a missing entry) pass through and are blanked by fillna.
    return preprocess_text(value) if isinstance(value, str) else value

  # One map per column replaces one positional write per cell.
  for column in features.columns:
    features[column] = features[column].map(clean_cell)

  return features.fillna("")

# The training frame ends with the target column; the competition test frame does not.
X_train_preprocessed = preprocess_df(train, test=False)
X_test_preprocessed = preprocess_df(df=test, test=True)

## Modelling

In [None]:
# @title <p> Make the corpus
# Train + test values of each cleaned column, stacked into one array per column.
# NOTE(review): the vectorizers below are fitted on the training columns only,
# so these arrays appear unused in the cells visible here - confirm before relying on them.
text_corpus, keyword_corpus, location_corpus = (
    np.concatenate((X_train_preprocessed[col].values, X_test_preprocessed[col].values))
    for col in ("text", "keyword", "location")
)

In [None]:
# @title <p> Fit the vectorizer
# One independent TF-IDF vocabulary per column, each capped at 2870 terms and
# fitted on the training rows only.
text_vectorizer, keyword_vectorizer, location_vectorizer = (
    TfidfVectorizer(max_features=2870) for _ in range(3)
)

X_text = text_vectorizer.fit_transform(X_train_preprocessed["text"]).toarray()
X_keyword = keyword_vectorizer.fit_transform(X_train_preprocessed["keyword"]).toarray()
X_location = location_vectorizer.fit_transform(X_train_preprocessed["location"]).toarray()

# NOTE(review): X_keyword is computed but left out of the feature matrix;
# only the text and location blocks are joined - confirm this is intentional.
X = np.concatenate((X_text, X_location), axis=1)

y = np.array(train["target"])

In [None]:
# @title <p> Split the data
# X_test / y_test are a hold-out slice of the labelled training data, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

In [None]:
# @title <p> Model the data
# Multinomial naive Bayes on the TF-IDF features, scored on the hold-out split.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.7880910683012259

## Evaluation

## Submission

In [None]:
# Encode the competition test set with the vocabularies learned from the training data.
X_sub_text = text_vectorizer.transform(X_test_preprocessed["text"]).toarray()
X_sub_keyword = keyword_vectorizer.transform(X_test_preprocessed["keyword"]).toarray()
X_sub_location = location_vectorizer.transform(X_test_preprocessed["location"]).toarray()

# Same column layout as the training matrix X: text block followed by location block
# (X_sub_keyword is computed but, as in training, not included).
X_sub = np.concatenate((X_sub_text, X_sub_location), axis=1)

y_sub = clf.predict(X_sub)

In [None]:
# Reuse the sample file for its id column and column layout. Predictions are
# placed by position, which assumes sample_submission lists ids in the same
# order as test.csv (true for the heads displayed above - verify for the full files).
sub = sample_submission.copy()
# Bracket assignment always writes a column; attribute assignment (sub.target = ...)
# would silently create a plain attribute if the column were ever missing.
sub["target"] = y_sub
sub.to_csv('submission.csv', index=False)

In [None]:
# @title <p> Submit
!kaggle competitions submit -c nlp-getting-started -f 'submission.csv' -m 'First submission'

100% 22.2k/22.2k [00:00<00:00, 63.9kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets