In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
%matplotlib inline
sns.set_style('whitegrid')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [3]:
file_train = './DataSets/nlp-disaster/train.csv'
file_test = './DataSets/nlp-disaster/test.csv'

In [4]:
train = pd.read_csv(file_train)
test = pd.read_csv(file_test)

## About Data
**id: a unique identifier for each tweet**  
**text: The text of a tweet**  
**keyword: A keyword from that tweet (although this may be blank!)**  
**location: The location the tweet was sent from (may also be blank)**  
**target: in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)**

# 1.0 - Exploratory Data Analysis

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
## Checking for NaN values
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

# 2.0 Manipulation and Cleaning

In [8]:
## Get only no null columns
tweets = train[['text','target']]
tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
def remove_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stop(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)

def stemming(text):    
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [10]:
## Remove Punctuation on text
tweets['text'] = tweets['text'].apply(remove_punct)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
## Remove Stop Words
sw = stopwords.words('english')
tweets['text'] = tweets['text'].apply(remove_stop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
tweets.head()

Unnamed: 0,text,target
0,deed reason earthquak may allah forgiv us,1
1,forest fire near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,13000 peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1


In [16]:
## reduces the words to the radical
stemmer = SnowballStemmer("english")
tweets['text'] = tweets['text'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# 3.0 - Machine Learning

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [19]:
vectorizer = CountVectorizer(analyzer='word', binary=True)
vectorizer.fit(tweets['text'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
X = vectorizer.transform(tweets['text']).todense()
y = tweets['target'].values
X.shape, y.shape

((7613, 19272), (7613,))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

In [24]:
model = LogisticRegression()

In [25]:
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
pred = model.predict(X_test)
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.79      0.83       962
           1       0.69      0.83      0.76       561

    accuracy                           0.80      1523
   macro avg       0.79      0.81      0.80      1523
weighted avg       0.82      0.80      0.81      1523



In [43]:
## Try another aproach
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [51]:
def clean_text(msg):
    ## Remove punctuations
    no_punc = [char for char in msg if char not in string.punctuation]
    ## Join to string again
    no_punc = ''.join(no_punc)
    ## Remove stop words
    clean = [word for word in no_punc.split() if word.lower() not in stopwords.words('english')]
    
    
    return clean

In [52]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=clean_text)), 
                     ('tfidf',TfidfTransformer()), 
                     ('classifier', MultinomialNB())])

In [47]:
pipeline.fit(X_train, y_train)

AttributeError: 'matrix' object has no attribute 'lower'