<a href="https://colab.research.google.com/github/tamtemtomm/kaggle-notebooks/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Dependencies

In [1]:
# @title <p>Essential Import
import os, shutil, json
from PIL import Image
from zipfile import ZipFile
import matplotlib.pyplot as plt
import numpy as np, pandas as pd, random as rd
import warnings
warnings.filterwarnings("ignore")

In [113]:
# @title <p> Essential NLP Import
import re, spacy, nltk, numpy as np
from textblob import TextBlob

# Fetch the NLTK resources this notebook relies on: 'punkt' (sentence
# tokenizer models) and 'stopwords' (the stop-word corpus read below).
# These must run before the corpus is accessed.
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from nltk.stem.lancaster import LancasterStemmer
# Alternative stemmers, currently disabled:
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer

# English stop-word list; the preprocessing cell extends and filters with it.
nltk_stw_en = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Kaggle Authentication

In [2]:
!pip install -q kaggle

In [None]:
# @title <p> Import kaggle API
# from google.colab import files
# files.upload()

# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle
# ! chmod 600 ~/.kaggle/kaggle.json

In [4]:
# @title <p>Import kaggle API from google drive
from google.colab import drive
drive.mount('/content/gdrive')

! mkdir ~/.kaggle
! cp '/content/gdrive/MyDrive/Colab Notebooks/kaggle.json' ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

drive.flush_and_unmount()

Mounted at /content/gdrive


### Inspect Dataset

In [7]:
# @title <p>Download Dataset
!kaggle competitions download -c nlp-getting-started
!unzip nlp-getting-started.zip &> /dev/null
!rm nlp-getting-started.zip

In [9]:
# @title <p> Get train data
# Read the training CSV into a DataFrame and preview its first five rows.
train: pd.DataFrame = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
# @title <p> Get test data
# Read the test CSV into a DataFrame and preview its first five rows.
test: pd.DataFrame = pd.read_csv('test.csv')
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [11]:
# @title <p> Get sample_submission data
# Read the sample submission CSV into a DataFrame and preview its first five rows.
sample_submission: pd.DataFrame = pd.read_csv('sample_submission.csv')
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


## Explore Datasets

### Vibe Check

In [12]:
# @title <p> Dataset info
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [13]:
# @title <p> Dataset describe
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [31]:
# @title <p> Check null values percentage
# Report, for every column except the identifier, how many values are
# missing and what share of all rows that represents.
n_rows = len(train)
for col in (name for name in train.columns if name != 'id'):
  # Count the missing entries once and reuse the count for both figures.
  n_missing = int(train[col].isna().sum())
  print(f'Null values for {col} : {n_missing} ({n_missing/n_rows*100}%)')

Null values for keyword : 61 (0.8012610009194797%)
Null values for location : 2533 (33.27203467752528%)
Null values for text : 0 (0.0%)
Null values for target : 0 (0.0%)


## Preprocessing

In [127]:
# @title <p> Initialize preprocess function

# Tokenizer used as the final preprocessing step.
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

# Extend the NLTK stop-word list with "oh". The guard keeps the cell
# idempotent: without it every re-run appends another duplicate entry.
if "oh" not in nltk_stw_en:
    nltk_stw_en = nltk_stw_en + ["oh"]

# Any whitespace-separated token that CONTAINS one of these substrings is
# dropped by preprocess_text (the check is a substring match, not a prefix match).
link_starters = ['http', "@", "\x89"]

# NOTE(review): not referenced by preprocess_text in this cell — confirm it is
# used further down before removing it.
stemmer = LancasterStemmer()


def _clean_tweet(text: str) -> str:
    """Clean one tweet and return it as a single space-separated string."""
    # Keep only the first sentence. Empty input yields no sentences, so fall
    # back to "" instead of raising IndexError.
    # NOTE(review): everything after the first sentence is discarded — confirm
    # that this is intended for multi-sentence tweets.
    sentences = nltk.sent_tokenize(text)
    text = sentences[0] if sentences else ""

    # Drop every token that contains one of the marker substrings.
    for marker in link_starters:
        text = " ".join(token for token in text.split() if marker not in token)

    # Case-fold and turn newlines into spaces.
    text = text.strip().lower().replace("\n", " ")

    # Replace everything except ASCII letters, apostrophes and spaces.
    text = re.sub(r"[^a-zA-Z' ]", ' ', text)

    # Collapse runs of spaces.
    text = re.sub(' +', ' ', text).strip()

    # Remove stop words; a set makes each membership test O(1).
    stop_words = set(nltk_stw_en)
    text = ' '.join(token for token in text.split() if token not in stop_words)

    # Tweet-tokenize and join the tokens back into one string.
    return ' '.join(tweet_tokenizer.tokenize(text))


def preprocess_text(texts: "str | Iterable[str]") -> str:
    """Clean tweet text and return one cleaned string.

    Args:
        texts: Either a single tweet, or an iterable of tweets (for example a
            pandas Series).

    Returns:
        The cleaned tweet. For an iterable, only its first element is cleaned
        and returned, which matches what this function has always returned.
        To clean a whole column, map the function over it, e.g.
        ``train.text.apply(preprocess_text)``.

    Raises:
        IndexError: If ``texts`` is an empty iterable.
    """
    # A plain string is one tweet. Iterating over it (as the previous version
    # did) processed it character by character and returned only the first
    # character.
    if isinstance(texts, str):
        return _clean_tweet(texts)

    # Only the first element is ever returned, so clean just that one instead
    # of the whole collection.
    for text in texts:
        return _clean_tweet(text)

    raise IndexError("preprocess_text() received an empty collection of texts")

# preprocess_text returns a single cleaned string (the first tweet of the
# column), so `texts` holds one string here, not a list.
texts = preprocess_text(train['text'])
texts

'deeds reason earthquake may allah forgive us'

'd'