In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
%matplotlib inline
sns.set_style('whitegrid')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [4]:
file_train = './DataSets/nlp-disaster/train.csv'
file_test = './DataSets/nlp-disaster/test.csv'

In [5]:
train = pd.read_csv(file_train)
test = pd.read_csv(file_test)

## About Data
**id: a unique identifier for each tweet**  
**text: The text of a tweet**  
**keyword: A keyword from that tweet (although this may be blank!)**  
**location: The location the tweet was sent from (may also be blank)**  
**target: in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)**

# 1.0 - Exploratory Data Analysis

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
## Checking for NaN values
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

# 2.0 Manipulation and Cleaning

In [9]:
## Get only no null columns
tweets = train[['text','target']]
tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
def remove_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stop(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)

def stemming(text):    
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [11]:
## Remove Punctuation on text
tweets['text'] = tweets['text'].apply(remove_punct)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
## Remove Stop Words
sw = stopwords.words('english')
tweets['text'] = tweets['text'].apply(remove_stop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
tweets.head()

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,13000 people receive wildfires evacuation orde...,1
4,got sent photo ruby alaska smoke wildfires pou...,1


In [13]:
## reduces the words to the radical
stemmer = SnowballStemmer("english")
tweets['text'] = tweets['text'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [1]:
tweets

NameError: name 'tweets' is not defined

# 3.0 - Machine Learning

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [15]:
vectorizer = CountVectorizer(analyzer='word', binary=True)
vectorizer.fit(tweets['text'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
X = vectorizer.transform(tweets['text']).todense()
y = tweets['target'].values
X.shape, y.shape

((7613, 19319), (7613,))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

In [20]:
model = LogisticRegression()

In [21]:
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
y_pred = model.predict(X_test)

In [27]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       849
           1       0.83      0.70      0.76       674

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.80      0.80      1523



# 4.0 - Submission

In [29]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [33]:
tweets_test = test['text']

In [34]:
test_X = vectorizer.transform(tweets_test).todense()
test_X.shape

(3263, 19319)

In [35]:
pred = model.predict(test_X)

In [50]:
sub = test
sub.drop(['text','keyword','location'], axis=1,inplace=True)

In [52]:
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [53]:
sub['target'] = pred

In [54]:
sub.to_csv("./Submissions/submission-tweets.csv", index=False)