In [8]:
#import required stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

import re #for working with regular expression
import nltk #for natural language processing (nlp)
import spacy #also for nlp
import string #This is a module, Python also has built-in class str, these are different

In [35]:
train_df = pd.read_csv(r'/Users/vishalverma/Vishal/Kaggle/NLP on Tweets/Data/train.csv')
test_df = pd.read_csv(r'/Users/vishalverma/Vishal/Kaggle/NLP on Tweets/Data/test.csv')
sample_sub1 = pd.read_csv(r'/Users/vishalverma/Vishal/Kaggle/NLP on Tweets/Data/sample_submission.csv')

In [10]:
# Merging train and test dataframe for performing text-preprocessing
train_df_copy = train_df
train_df = train_df.drop('target', axis = 1)
frames = [train_df,test_df]
train_df = pd.concat(frames)

In [12]:
# Converting everything in Lower case
train_df['lowered_text'] = train_df['text'].str.lower()

In [13]:
# Removing punctuation
punctuation=string.punctuation
mapping=str.maketrans("","",punctuation)

def remove_punctuation(in_str):
    return in_str.translate(mapping)

print(train_df['lowered_text'].head(10))   
train_df['lowered_text']=train_df["lowered_text"].apply(lambda x: remove_punctuation(x))
print(train_df['lowered_text'].head(10)) 

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
5    #rockyfire update => california hwy. 20 closed...
6    #flood #disaster heavy rain causes flash flood...
7    i'm on top of the hill and i can see a fire in...
8    there's an emergency evacuation happening now ...
9    i'm afraid that the tornado is coming to our a...
Name: lowered_text, dtype: object
0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
5    rockyfire update  california hwy 20 closed in ...
6    flood disaster heavy rain causes flash floodin...
7    im on top of the hill and 

In [14]:
# Removing Stop words
from nltk.corpus import stopwords
stopwords_eng=stopwords.words('english')

print(train_df["lowered_text"].head(10)) #before

def remove_stopwords(in_str):
    new_str=''
    words=in_str.split()
    for tx in words:
        if tx not in stopwords_eng:
            new_str=new_str + tx + " "
    return new_str

train_df['lowered_text_stop_removed']=train_df["lowered_text"].apply(lambda x: remove_stopwords(x))
print(train_df["lowered_text_stop_removed"].head(10))

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
5    rockyfire update  california hwy 20 closed in ...
6    flood disaster heavy rain causes flash floodin...
7    im on top of the hill and i can see a fire in ...
8    theres an emergency evacuation happening now i...
9     im afraid that the tornado is coming to our area
Name: lowered_text, dtype: object
0        deeds reason earthquake may allah forgive us 
1               forest fire near la ronge sask canada 
2    residents asked shelter place notified officer...
3    13000 people receive wildfires evacuation orde...
4    got sent photo ruby alaska smoke wildfires pou...
5    rockyfire update california hwy 20 closed dire...
6    flood disaster heavy rain causes flash floodin...
7                          im t

In [15]:
# Removing most frequent 10 words
from collections import Counter
counter=Counter()
for text in train_df["lowered_text_stop_removed"]:
    for word in text.split():
        counter[word]+=1
most_cmn_list=counter.most_common(10)
print(type(most_cmn_list), most_cmn_list)
most_cmn_words_list=[]
for word, freq in most_cmn_list:
    most_cmn_words_list.append(word)
print('Most common words : ', most_cmn_words_list)

def remove_frequent(in_str):
    new_str=''
    for word in in_str.split():
        if word not in most_cmn_words_list:
            new_str=new_str + word + " "
    return new_str

train_df["lowered_text_stop_removed_freq_removed"]=train_df['lowered_text_stop_removed'].apply(lambda x: remove_frequent(x))

<class 'list'> [('like', 490), ('amp', 434), ('im', 419), ('fire', 357), ('get', 335), ('new', 326), ('via', 324), ('news', 282), ('people', 278), ('one', 277)]
Most common words :  ['like', 'amp', 'im', 'fire', 'get', 'new', 'via', 'news', 'people', 'one']


In [16]:
# Removing 10 most rare words
most_rare_list=counter.most_common()[-10:]
most_rare_words=[]
for word, freq in most_rare_list:
    most_rare_words.append(word)
print('Most rare words : ',most_rare_words)

def remove_rare(in_text):
    new_text=""
    for word in in_text.split():
        if word not in most_rare_words:
            new_text=new_text + word + " "
    return new_text

train_df["lowered_stop_freq_rare_removed"]=train_df["lowered_text_stop_removed_freq_removed"].apply(lambda x: remove_rare(x))

Most rare words :  ['httptcotjpylu9fox', 'httptcopfavw5qyqe', 'httptcohkut5msdtp', 'issuicide', 'rajman', 'hasaka', 'risen', 'fasteners', 'xrwn', 'httptcoutbxlcbiuy']


In [17]:
#lemmatizer

from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()

def do_lemmatizing(in_str):
    new_str=""
    for word in in_str.split():
        new_str=new_str + lem.lemmatize(word) + " "
    return new_str

train_df["Lemmatized"]=train_df["lowered_stop_freq_rare_removed"].apply(lambda x: do_lemmatizing(x))

In [18]:
# Removing URLs
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
def remove_html(in_str):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', in_str)

train_df["urls_removed"]=train_df["Lemmatized"].apply(lambda x: remove_urls(x))
train_df["html_removed"]=train_df["urls_removed"].apply(lambda x: remove_html(x))

In [19]:
# Converting chat words to actual text
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

chat_words_expanded_dict = {}
chat_words_list = []
for line in chat_words_str.split("\n"):
    if line != "":
        chat_word = line.split("=")[0]
        chat_word_expanded = line.split("=")[1]
        chat_words_list.append(chat_word)
        chat_words_expanded_dict[chat_word] = chat_word_expanded
chat_words_list = set(chat_words_list)

def convert_chat_words(in_str):
    new_str = ""
    for w in in_str.split():
        if w.upper() in chat_words_list:
            new_str = new_str + chat_words_expanded_dict[w.upper()] + " "
        else:
            new_str = new_str + w + " "
    return new_str

train_df["chat_words_coverted"]=train_df["html_removed"].apply(lambda x: convert_chat_words(x))

In [23]:
pip install pyspellchecker

Collecting pyspellchecker
  Using cached pyspellchecker-0.6.3-py3-none-any.whl (2.7 MB)
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.3
Note: you may need to restart the kernel to use updated packages.


In [27]:
train_df["spellings_corrected"]=train_df["chat_words_coverted"]

In [28]:
print(test_df.shape)
print(train_df.shape)
print(train_df_copy.shape)

(3263, 4)
(10876, 13)
(7613, 5)


In [29]:
# splitting dataframe into train_df and test_df
train_df_copy2 = train_df

test_df = train_df.iloc[7613:,:]
train_df = train_df.iloc[:7613,:]

test_df['text'] = train_df_copy2.iloc[7613:,:]['spellings_corrected']
train_df['text'] = train_df_copy2.iloc[:7613,:]['spellings_corrected']

print(test_df.shape)
print(train_df.shape)

train_df['target'] = train_df_copy['target'].values
# 7613
# 3263
train_df.head(5)

(3263, 13)
(7613, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,id,keyword,location,text,lowered_text,lowered_text_stop_removed,lowered_text_stop_removed_freq_removed,lowered_stop_freq_rare_removed,Lemmatized,urls_removed,html_removed,chat_words_coverted,spellings_corrected,target
0,1,,,deed reason earthquake may allah forgive You,our deeds are the reason of this earthquake ma...,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive You,deed reason earthquake may allah forgive You,1
1,4,,,forest near la ronge sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,all residents asked to shelter in place are be...,residents asked shelter place notified officer...,residents asked shelter place notified officer...,residents asked shelter place notified officer...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,1
3,6,,,13000 receive wildfire evacuation order califo...,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,13000 receive wildfires evacuation orders cali...,13000 receive wildfires evacuation orders cali...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,1


In [30]:
print(test_df.shape)
print(train_df.shape)

(3263, 13)
(7613, 14)


In [31]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [32]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

In [33]:
print(train_vectors)
print(test_vectors)

  (0, 4164)	1
  (0, 16478)	1
  (0, 4880)	1
  (0, 13831)	1
  (0, 1104)	1
  (0, 5904)	1
  (0, 20821)	1
  (1, 5895)	1
  (1, 14603)	1
  (1, 12989)	1
  (1, 16991)	1
  (1, 17261)	1
  (1, 2767)	1
  (2, 16731)	1
  (2, 1528)	1
  (2, 17623)	2
  (2, 15725)	2
  (2, 14861)	1
  (2, 15015)	1
  (2, 5293)	1
  (2, 15200)	1
  (2, 5383)	1
  (3, 5293)	1
  (3, 15200)	1
  (3, 138)	1
  :	:
  (7610, 6790)	1
  (7610, 6)	1
  (7610, 19922)	1
  (7610, 11677)	1
  (7611, 2823)	1
  (7611, 15816)	1
  (7611, 16850)	1
  (7611, 12145)	1
  (7611, 13331)	1
  (7611, 17510)	1
  (7611, 12262)	1
  (7611, 19151)	1
  (7611, 3398)	1
  (7611, 4902)	2
  (7611, 15867)	1
  (7611, 18561)	1
  (7611, 14803)	1
  (7612, 20457)	1
  (7612, 2726)	1
  (7612, 7054)	1
  (7612, 13070)	1
  (7612, 14832)	1
  (7612, 756)	1
  (7612, 16423)	1
  (7612, 11591)	1
  (0, 2823)	1
  (0, 3751)	1
  (0, 6713)	1
  (0, 18959)	1
  (1, 3212)	1
  (1, 4426)	1
  (1, 4880)	1
  (1, 5316)	1
  (1, 6834)	1
  (1, 17161)	1
  (1, 18341)	1
  (2, 849)	1
  (2, 2785)	1
  (2, 578

In [34]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(train_vectors, train_df["target"])

LogisticRegression(random_state=0)

In [37]:
sample_sub1["target"] = clf_lr.predict(test_vectors)
sample_sub1.to_csv("/Users/vishalverma/Vishal/Kaggle/NLP on Tweets/results/submission.csv", index=False)