In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [77]:
landslide_data = pd.read_csv('landslide_data.csv')

In [78]:
landslide_data.dtypes

source_name                   object
source_link                   object
event_id                       int64
event_date                    object
event_time                   float64
event_title                   object
event_description             object
location_description          object
location_accuracy             object
landslide_category            object
landslide_trigger             object
landslide_size                object
landslide_setting             object
fatality_count               float64
injury_count                 float64
storm_name                    object
photo_link                    object
notes                         object
event_import_source           object
event_import_id              float64
country_name                  object
country_code                  object
admin_division_name           object
admin_division_population    float64
gazeteer_closest_point        object
gazeteer_distance            float64
submitted_date                object
c

In [79]:
landslide_data.shape
used_columns = ['timestamp', 'event_source', 'landslide_category', 'landslide_trigger', 'landslide_size', 'landslide_setting', 'fatality_count', 'injury_count', 'country_code', 'longitude', 'latitude'] 

(11033, 31)

In [80]:
landslide_data['timestamp'] = pd.to_datetime(landslide_data['event_date'], format='%m/%d/%Y %H:%M:%S %p')

In [81]:
landslide_data['timestamp']

0       2008-08-01 12:00:00
1       2009-01-02 02:00:00
2       2007-01-19 12:00:00
3       2009-07-31 12:00:00
4       2010-10-16 12:00:00
                ...        
11028   2017-04-01 01:34:00
11029   2017-03-25 05:32:00
11030   2016-12-15 05:00:00
11031   2017-04-29 07:03:00
11032   2017-03-13 02:32:00
Name: timestamp, Length: 11033, dtype: datetime64[ns]

In [82]:
landslide_data['landslide_category'] = landslide_data['landslide_category'].astype('category')

In [83]:
landslide_data['landslide_category'].describe()

count         11032
unique           14
top       landslide
freq           7648
Name: landslide_category, dtype: object

In [84]:
landslide_data['source_name'].nunique()
landslide_data.shape

(11033, 32)

In [85]:
df1 = landslide_data.groupby(['source_name']).size().sort_values(ascending=False).reset_index(name="count")
df1.head(30)

Unnamed: 0,source_name,count
0,Oregon DOT,768
1,maps.google.com,104
2,thehimalayantimes,75
3,news.xinhuanet,74
4,newsinfo.inquirer,71
5,thejakartapost,59
6,ibnlive.in,57
7,Times of India,47
8,The Jakarta Post,46
9,The Himalayan Times,43


In [86]:
major_sources = set(df1['source_name'].iloc[:20])

In [87]:
print(major_sources)

{'Red Cross - Field reports', 'Hindustan Times', 'maps.google.com', 'The Jakarta Post', 'thejakartapost', 'google', 'thehimalayantimes', 'colombiareports.com', 'news.xinhuanet', 'Times of India', 'GMA News', 'newsinfo.inquirer', 'laht', 'Oregon DOT', 'Seattle Times', 'The Hindu', 'ibnlive.in', 'reliefweb', 'articles.timesofindia.indiatimes.com', 'The Himalayan Times'}


In [88]:
def getEventSource(row):
    source = 'Others'
    if row['source_name'] in major_sources:
        source = row['source_name']
    row['event_source'] = source
    return row
    

df2 = landslide_data.apply(getEventSource, axis=1)

In [90]:
df2[df2['source_name'] == 'google'].head(1)

Unnamed: 0,source_name,source_link,event_id,event_date,event_time,event_title,event_description,location_description,location_accuracy,landslide_category,...,admin_division_population,gazeteer_closest_point,gazeteer_distance,submitted_date,created_date,last_edited_date,longitude,latitude,timestamp,event_source
553,google,http://www.google.com/hostednews/canadianpress...,2410,09/10/2010 12:00:00 AM,,"Atrani village, less then 1 mi from Amalfi","Sep 10, 2010 ROME — A river of mud unleashed ...","Atrani village, less then 1 mi from Amalfi",1km,mudslide,...,887.0,Atrani,0.24389,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,14.6074,40.6378,2010-09-10 12:00:00,google


In [91]:
landslide_data['event_title'].nunique()

10549

In [99]:
unused_columns = ['event_title', 'event_description', 'location_description', 'location_accuracy', 'storm_name', 'photo_link', 'notes', 'event_import_source', 'event_import_id', 'country_name']

In [93]:
landslide_data['landslide_category'].describe()

count         11032
unique           14
top       landslide
freq           7648
Name: landslide_category, dtype: object

In [94]:
landslide_data['landslide_trigger'].describe()

count        11010
unique          18
top       downpour
freq          4680
Name: landslide_trigger, dtype: object

In [95]:
landslide_data['landslide_size'].describe()

count      11024
unique         6
top       medium
freq        6551
Name: landslide_size, dtype: object

In [96]:
landslide_data['landslide_setting'].describe()

count       10964
unique         14
top       unknown
freq         6291
Name: landslide_setting, dtype: object

In [97]:
df2 = landslide_data.groupby(['landslide_setting']).size().sort_values(ascending=False).reset_index(name="count")
df2.head(30)

Unnamed: 0,landslide_setting,count
0,unknown,6291
1,above_road,3104
2,natural_slope,531
3,urban,264
4,below_road,199
5,mine,157
6,above_river,149
7,deforested_slope,53
8,other,50
9,bluff,48


In [98]:
landslide_data.isna().sum()

source_name                      0
source_link                    846
event_id                         0
event_date                       0
event_time                   11033
event_title                      0
event_description              862
location_description           102
location_accuracy                2
landslide_category               1
landslide_trigger               23
landslide_size                   9
landslide_setting               69
fatality_count                1385
injury_count                  5674
storm_name                   10456
photo_link                    9537
notes                        10716
event_import_source           1563
event_import_id               1562
country_name                  1562
country_code                  1564
admin_division_name           1637
admin_division_population     1562
gazeteer_closest_point        1563
gazeteer_distance             1562
submitted_date                  10
created_date                     1
last_edited_date    