# Disaster Tweet Classification

## Load data

In [27]:
import pandas as pd
import numpy as np
import os
# Work from the dataset directory so the relative 'train.csv' path resolves.
# Set the NLP_DATA_DIR environment variable to point at the data on another
# machine; the original local path is kept as the default.
os.chdir(os.environ.get('NLP_DATA_DIR', '/Users/younghun/Desktop/gitrepo/data/nlp-getting-started/'))

In [28]:
# Load the training split (UTF-8), report its (rows, columns) shape,
# and preview the first five rows.
tweet = pd.read_csv(filepath_or_buffer='train.csv', encoding='utf-8')
print(f"{tweet.shape}")
tweet.head(5)

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [29]:
# Count the missing entries in each column.
tweet.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [30]:
# List the distinct keyword values (NaN included).
pd.unique(tweet['keyword'])

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [31]:
# Number of distinct (non-null) locations, followed by a sample of the
# first ten distinct values.
print(tweet['location'].nunique(dropna=True))
pd.unique(tweet['location'])[:10]

3341


array([nan, 'Birmingham', 'Est. September 2012 - Bristol', 'AFRICA',
       'Philadelphia, PA', 'London, UK', 'Pretoria', 'World Wide!!',
       'Paranaque City', 'Live On Webcam'], dtype=object)

- keyword 결측치는 61개라서 대체할 값을 찾든가 하자.
- location은 결측치가 너무 많을 뿐더러 disaster tweet인지 아닌지 예측하는 데에 영향이 별로 크지 않을 것 같아서 삭제하기로 하자.

In [32]:
# Remove the 'location' column from the frame in place.
tweet.drop(columns=['location'], inplace=True)

## Replace missing values of 'keyword' with an appropriate value

In [33]:
# Frequency of each keyword; show the 50 most common.
tweet['keyword'].value_counts().head(50)

fatalities     45
deluge         42
armageddon     42
sinking        41
harm           41
damage         41
body%20bags    41
outbreak       40
twister        40
fear           40
collided       40
windstorm      40
evacuate       40
siren          40
famine         39
sunk           39
whirlwind      39
flames         39
wrecked        39
wreckage       39
sinkhole       39
collision      39
weapons        39
weapon         39
explosion      39
earthquake     39
hellfire       39
derailment     39
explode        38
injury         38
thunder        38
ambulance      38
hurricane      38
blaze          38
upheaval       38
flooding       38
oil%20spill    38
fire           38
typhoon        38
derailed       38
fatal          38
drowned        38
bombed         38
deaths         38
wreck          37
wounded        37
fatality       37
bioterror      37
mudslide       37
blizzard       37
Name: keyword, dtype: int64

- fatalities : 죽음
- deluge : 대홍수
- armageddon : 대결전
- damage : 피해
- ... => 모두 부정적인 어감의 단어들인 듯 하다

- **``pd.Series에 정규표현식 사용해서 문자열 처리하는 방법!``**

    - replace(to_replace=정규표현식)
    - str.replace에 정규표현식도 이용가능

In [34]:
# Normalise the non-missing keywords.
# Keywords use the URL-encoded space '%20' (e.g. 'body%20bags'); decode it to a
# real space first so multi-word keywords stay as separate words instead of
# being fused ('bodybags'). Any remaining digit/special character is removed.
# NaN entries pass through untouched.
tweet['keyword'] = (
    tweet['keyword']
    .str.replace('%20', ' ', regex=False)
    .str.replace('[^a-zA-Z ]', '', regex=True)
)

In [35]:
# Replace every special character and digit in the tweet text with a space.
# regex=True is required: without it, pandas >= 2.0 treats the pattern as a
# literal string and the call leaves the text unchanged.
tweet['text'] = tweet['text'].str.replace('[^a-zA-Z]', ' ', regex=True)

In [36]:
# Keep only the rows whose keyword is missing, renumber them from 0,
# and preview the result.
keyword_null = tweet.loc[tweet['keyword'].isna()].reset_index(drop=True)
keyword_null.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this earthquake M...,1
1,4,,Forest fire near La Ronge Sask Canada,1
2,5,,All residents asked to shelter in place are ...,1
3,6,,people receive wildfires evacuation or...,1
4,7,,Just got sent this photo from Ruby Alaska as ...,1


- keyword가 결측치인 데이터가 61개밖에 되지 않기 때문에 각 데이터의 text를 보고 keyword를 사람이 직접 추출하자.

In [37]:
# Print each keyword-less tweet together with its id so a keyword can be
# assigned by hand. Iterates the two columns directly instead of doing a
# .loc lookup per row, and avoids shadowing the builtin `id`.
for idx, (tweet_id, content) in enumerate(zip(keyword_null['id'], keyword_null['text'])):
    print(f"#index number : {idx}, Tweet id : {tweet_id}")
    print(f"Content :\n{content}")
    print('-'*80)

#index number : 0, Tweet id : 1
Content :
Our Deeds are the Reason of this  earthquake May ALLAH Forgive us all
--------------------------------------------------------------------------------
#index number : 1, Tweet id : 4
Content :
Forest fire near La Ronge Sask  Canada
--------------------------------------------------------------------------------
#index number : 2, Tweet id : 5
Content :
All residents asked to  shelter in place  are being notified by officers  No other evacuation or shelter in place orders are expected
--------------------------------------------------------------------------------
#index number : 3, Tweet id : 6
Content :
       people receive  wildfires evacuation orders in California 
--------------------------------------------------------------------------------
#index number : 4, Tweet id : 7
Content :
Just got sent this photo from Ruby  Alaska as smoke from  wildfires pours into a school 
--------------------------------------------------------------------

In [38]:
# Hand-assigned keywords, keyed by tweet id, for the tweets whose 'keyword'
# is missing (61 entries). Values may hold several space-separated words.
keyword_dict = {
    1:'earthquake',4:'fire',5:'shelter evacuation',6:'wildfires evacuation',7:'wildfires',8:'wildfires',
    10:'flood disaster',13:'fire',14:'emergency evacuation',15:'tornado',16:'die heat wave',17:'flood',
    18:'flood',19:'flood',20:'damage crash',23:'man',24:'love',25:'lovely',26:'fast',28:'goal',31:'ridiculous',
    32:'cool',33:'love', 34:'wonderful',36:'funny', 37:'shit', 38:'location', 39:'love', 40:'cool', 41:'like',
    44:'end', 10835:'bomb', 10837:'explode', 10839:'flood', 10840:'seismic earthquake', 10841:'siren',
    10842:'attack kill', 10843:'earthquake',10844:'typhoon', 10846:'heat wave warning', 10847:'bomber',
    10848:'loud', 10849:'explode', 10850:'flood', 10851:'thunderstorm', 10852:'debris', 10853:'collide',
    10854:'earthquake', 10855:'evacuation',10859:'break', 10860:'siren tornado warning', 10862:'quarantine',
    10863:'fire',10864:'bomb',10866:'suicide bomb', 10867:'storm', 10869:'collapse', 10870:'fire', 10871:'volcano',
    10872:'injury', 10873:'wildfire'
}

In [39]:
# Write the hand-assigned keywords into the rows whose id appears in
# keyword_dict. Done with a boolean mask and Series.map instead of a row-wise
# loop: this does not depend on the frame having a default 0..n-1 index and
# does not shadow the builtin `id`.
has_manual_keyword = tweet['id'].isin(list(keyword_dict))
tweet.loc[has_manual_keyword, 'keyword'] = tweet.loc[has_manual_keyword, 'id'].map(keyword_dict)

In [40]:
# Prepend each row's keyword (plus a separating space) to its text, then
# drop the now-redundant 'keyword' column and preview the frame.
keyword_prefix = tweet['keyword'] + ' '
tweet['text'] = keyword_prefix + tweet['text']
tweet.drop(columns=['keyword'], inplace=True)
tweet.head()

Unnamed: 0,id,text,target
0,1,earthquake Our Deeds are the Reason of this e...,1
1,4,fire Forest fire near La Ronge Sask Canada,1
2,5,shelter evacuation All residents asked to she...,1
3,6,wildfires evacuation people receive wi...,1
4,7,wildfires Just got sent this photo from Ruby ...,1


In [41]:
# The 'id' column is no longer needed; remove it in place.
tweet.drop(columns=['id'], inplace=True)

In [42]:
# Next step: turn the text into feature vectors with CountVectorizer or TfidfVectorizer

Unnamed: 0,text,target
0,earthquake Our Deeds are the Reason of this e...,1
1,fire Forest fire near La Ronge Sask Canada,1
2,shelter evacuation All residents asked to she...,1
3,wildfires evacuation people receive wi...,1
4,wildfires Just got sent this photo from Ruby ...,1
...,...,...
7608,collapse Two giant cranes holding a bridge col...,1
7609,fire aria ahrary TheTawniest The out of cont...,1
7610,volcano M UTC km S of Volcano Haw...,1
7611,injury Police investigating after an e bike co...,1
