In [1]:
import numpy as np
import pandas as pd 
import nltk
import matplotlib.pyplot as plt
import string



In [2]:
train = pd.read_csv("tweet_train.csv")
test = pd.read_csv("tweet_test.csv")

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Turjya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
train_copy = train
test_copy = test

In [7]:
#First, we need to start filtering out what is not needed to simplify the process, location and id are irrelevant
train_copy = train_copy.drop(['location'], axis=1)
train_copy = train_copy.drop(['id'], axis=1)

In [8]:
#We should also drop missing data that has missing information in keywords, only column left with missing values
train_copy = train_copy.dropna()

In [9]:
#Once we remove, we need to reset the index so that it starts back from 0 for future use
train_copy.reset_index(inplace=True)
train_copy

Unnamed: 0,index,keyword,text,target
0,31,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
1,32,ablaze,We always try to bring the heavy. #metal #RT h...,0
2,33,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
3,34,ablaze,Crying out for more! Set me ablaze,0
4,35,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
7547,7578,wrecked,@jt_ruff23 @cameronhacker and I wrecked you both,0
7548,7579,wrecked,Three days off from work and they've pretty mu...,0
7549,7580,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7550,7581,wrecked,@engineshed Great atmosphere at the British Li...,0


In [10]:
#Splitting time
X = train_copy['text']
y = train_copy['target']

In [11]:
#setting to english
from nltk.corpus import stopwords
words = set(stopwords.words('english'))

In [12]:
from nltk.stem.snowball import SnowballStemmer
stem = SnowballStemmer('english')

In [13]:
#now we shall utilize a function for cleaning the text to strip it of all unessential parts
import re
def cleanse(input):
    #Remove all URLs
    input = re.sub('http\S+\s*', ' ', input)
    #Remove RT and cc
    input = re.sub('RT|cc', ' ', input)
    #Remove the digits
    input = re.sub(r'\d+', '', input)
    #Remove all hashtags
    input = re.sub('#\S+', '', input)
    #Removing any mentions and E-mails
    input = re.sub('@\S+', '  ', input) 
    #Removing any punctuations
    input = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', input) 
    #Instead of non-ASCII characters, we will leave a space
    input = re.sub(r'[^\x00-\x7f]',r' ', input) 
    #Any extra space shall be removed
    input = re.sub('\s+', ' ', input)
    #Lowercase for easier use for analysis
    input = "".join([char.lower() for char in input if char not in string.punctuation])
    #Remove any stopwords
    input = " ".join([word for word in str(input).split() if word not in words])
    #Apply stemming
    input = " ".join([stem.stem(word) for word in input.split()])
    return input

In [14]:
#We shall now use this function on our text
X = X.apply(lambda x: cleanse(x))

In [15]:
X

0                                   wholesal market ablaz
1                                   alway tri bring heavi
2                   break news nigeria flag set ablaz aba
3                                           cri set ablaz
4                     plus side look sky last night ablaz
                              ...                        
7547                                                wreck
7548    three day work pretti much wreck hahaha shouto...
7549                  cramer iger word wreck disney stock
7550    great atmospher british lion gig tonight hear ...
7551             cramer iger word wreck disney stock cnbc
Name: text, Length: 7552, dtype: object

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=6000)
X = vectorizer.fit_transform(X)
X = X.toarray()
X.shape

(7552, 6000)

In [18]:
#play around with parameters of random state and test size after finishing
#Now let us begin creating the model
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

In [19]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X,y)
xgb.score(X,y)

0.8518273305084746

In [20]:
#Now we test the model against our X_test
y_pred = xgb.predict(X_test)
y_pred

array([1, 1, 0, ..., 0, 1, 1])

In [21]:
#let us begin preparing the test data for use
X_test_data = test['text']

In [22]:
X_test_data

0                      Just happened a terrible car crash
1       Heard about #earthquake is different cities, s...
2       there is a forest fire at spot pond, geese are...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm in RI worse than last hurricane. My city...
3260    Green Line derailment in Chicago http://t.co/U...
3261    MEG issues Hazardous Weather Outlook (HWO) htt...
3262    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object

In [23]:
X_test_data = vectorizer.fit_transform(X_test_data)
X_test_data = X_test_data.toarray()

In [24]:
y_pred_final = xgb.predict(X_test_data)
y_pred_final

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
result = pd.DataFrame({'id':test_copy.id, 'target': y_pred_final})
result

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [26]:
#save to csv
result.to_csv('tweetsubmission.csv',index=False)