In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
# NOTE(review): the throwaway `_` is bound at notebook top level; in IPython this may shadow
# the built-in "last output" variable `_` — confirm that no later cell relies on it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and the file name to get the file's full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

import string, nltk, re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used by the text preprocessing further down:
# 'punkt' for word tokenization and 'stopwords' for the English stop-word list.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')


In [2]:
# Load the training and test sets from CSV files in the current working directory.
# NOTE(review): the first cell lists files under /kaggle/input, while these paths are
# relative ('./') — confirm the CSVs are actually present in the working directory.

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
# Count missing values per column of the training set.
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
# Keep only `id`, `text` and `target`: `keyword` and `location` are the two columns with
# missing values in the null count of the previous cell (61 and 2533 rows respectively).
# errors='ignore' makes this cell safe to re-run: `train` is rebound to the result, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
train = train.drop(columns=['keyword', 'location'], errors='ignore')
# Show the remaining columns and their dtypes.
train.dtypes

id         int64
text      object
target     int64
dtype: object

In [5]:
train # Display the frame to confirm that the 'keyword' and 'location' columns were dropped.

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,Police investigating after an e-bike collided ...,1


In [6]:
# Summary statistics for the numeric columns (`id` and `target`).
# `target` only takes the values 0 and 1, so its mean is the share of rows labelled 1.
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [7]:
# Show column dtypes, non-null counts and memory usage of the training set after the drop.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7613 non-null   int64 
 1   text    7613 non-null   object
 2   target  7613 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 178.6+ KB


In [8]:
# Re-check missing values now that `keyword` and `location` have been dropped.
train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [9]:
# Inspect the test set: unlike `train`, it still carries `keyword` and `location`
# and has no `target` column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [10]:
# Peek at 10 random test rows. A fixed random_state makes the displayed sample the same on
# every run; without it the cell shows different rows each time the notebook is re-executed.
test.sample(10, random_state=42)

Unnamed: 0,id,keyword,location,text
2651,8867,smoke,IL?MI,I need a smoke ??
2291,7660,panic,Louisiana the real La,Tried 2 4get\nFeelings of death\nInvading pani...
1960,6609,inundated,San Diego,So it doesn't mean inundated with tweets? @Dic...
2835,9412,survivors,Orlando,#orlando Survivors of Shanghai Ghetto reunite ...
2283,7625,pandemonium,Los Angeles,Pandemonium In Aba As Woman Delivers Baby With...
1064,3507,derailment,Worldwide,Google News - A Twin Train Derailment in India...
7,22,,,Hey! How are you?
39,124,accident,"All Motorways, UK",On the #M42 northbound between junctions J3 an...
2305,7706,panicking,"Brooklyn, NY",BizInsider: People are finally panicking about...
674,2196,catastrophic,Leanbox?,@_LalaDeviluke - consequences could have been ...


In [11]:
# Count missing values per column of the test set.
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [12]:
# Class balance of the label: number of training rows per `target` value.
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [13]:
# Print five example tweets from each class (the `[20:25]` slice of each class subset)
# to get a feel for what the labels mean.
# target == 0: tweets labelled as not being about a disaster.
print(f"Tweets that does not contain information about disaster :\n\n {train.loc[train['target']==0,'text'][20:25].values}", end="\n")

print()

# target == 1: tweets labelled as being about a disaster.
print(f"Tweets that contains information about disaster :\n\n {train.loc[train['target']==1,'text'][20:25].values}", end="\n")

Tweets that does not contain information about disaster :

 ['Ablaze for you Lord :D'
 'Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw'
 "on the outside you're ablaze and alive\nbut you're dead inside"
 'Had an awesome time visiting the CFC head office the ancop site and ablaze. Thanks to Tita Vida for taking care of us ??'
 'SOOOO PUMPED FOR ABLAZE ???? @southridgelife']

Tweets that contains information about disaster :

 ['Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k'
 'Man wife get six years jail for setting ablaze niece\nhttp://t.co/eV1ahOUCZA'
 'Police: Arsonist Deliberately Set Black Church In North CarolinaåÊAblaze http://t.co/pcXarbH9An'
 '#Kurds trampling on Turkmen flag later set it ablaze while others vandalized offices of Turkmen Front in #Diyala http://t.co/4IzFdYC3cg'
 'TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE OR TAMBO INTL. CARGO SECTION. http://t.co/8kscqKfKkF']


In [16]:
# Stop words are read from the NLTK corpus at most once and reused by every call.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocessing(text):
    """Lowercase, tokenize, drop English stop words from, and stem a piece of text.

    Parameters
    ----------
    text : str
        Raw text (for example a single tweet).

    Returns
    -------
    str
        The stemmed tokens that are not stop words, joined by single spaces.
        No character filtering is applied, so punctuation, digit and URL
        tokens produced by the tokenizer pass through (stemmed) to the output.

    Notes
    -----
    NOTE(review): an earlier revision compiled the regex ``[^a-z]`` here but
    never applied it. The dead local was removed without changing the output;
    if stripping non-letter characters is wanted, it must be added explicitly.
    """
    text = text.lower()
    words = nltk.word_tokenize(text)
    stop_words = _english_stop_words()
    # One stemmer per call instead of one per token.
    stemmer = PorterStemmer()
    # `text` is lowercased above, so tokens are compared to the stop-word list as-is.
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [None]:
# update 