<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Project | Demonstration of Data Cleaning and Manipulation with Pandas

## Import Useful Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import random
import operator
import re
import math

# The following code allows multiple outputs from notebook cells:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Read Data

In [2]:
# Load the GSAF5 CSV from the data/ folder beside this notebook rather than from a
# machine-specific absolute path, so the notebook can run from any checkout.
# Assumes the kernel's working directory is the notebook's own folder — TODO confirm.
data = pd.read_csv(os.path.join("data", "GSAF5.csv"), encoding="ISO-8859-1", low_memory=False)

## Examine For Potential Issues

In [3]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


### Missing Values

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 24 columns):
Case Number               5992 non-null object
Date                      5992 non-null object
Year                      5992 non-null int64
Type                      5992 non-null object
Country                   5949 non-null object
Area                      5590 non-null object
Location                  5496 non-null object
Activity                  5465 non-null object
Name                      5792 non-null object
Sex                       5425 non-null object
Age                       3311 non-null object
Injury                    5965 non-null object
Fatal (Y/N)               5973 non-null object
Time                      2779 non-null object
Species                   3058 non-null object
Investigator or Source    5977 non-null object
pdf                       5992 non-null object
href formula              5991 non-null object
href                      5989 non-null object
C

In [5]:
# Count of missing values in each column (sum() reduces down the rows by default).
null_cols = data.isnull().sum()

In [6]:
# Percentage of missing values per column, restricted to columns with at least one null.
# The denominator is the frame's current row count instead of a hard-coded 5992, so the
# percentages stay correct if the input file or any earlier filtering changes.
null_cols_percentage = null_cols[null_cols > 0] / len(data) * 100
null_cols_percentage

Country                    0.717623
Area                       6.708945
Location                   8.277704
Activity                   8.795060
Name                       3.337784
Sex                        9.462617
Age                       44.742991
Injury                     0.450601
Fatal (Y/N)                0.317089
Time                      53.621495
Species                   48.965287
Investigator or Source     0.250334
href formula               0.016689
href                       0.050067
Unnamed: 22               99.983311
Unnamed: 23               99.966622
dtype: float64

In [7]:
# Names of the columns in which more than 50 percent of the values are missing.
null_col_indexes = null_cols_percentage.loc[null_cols_percentage > 50].index.tolist()
null_col_indexes

['Time', 'Unnamed: 22', 'Unnamed: 23']

In [8]:
# Count of missing values in each row (reduce across the columns).
null_rows = data.isnull().sum(axis="columns")

In [9]:
# Percentage of missing values per row, restricted to rows with at least one null.
# The denominator is the frame's current column count instead of a hard-coded 24, so the
# percentages stay correct if the set of columns changes.
null_rows_percentage = null_rows[null_rows > 0] / data.shape[1] * 100
null_rows_percentage

0       12.500000
1       12.500000
2       12.500000
3       20.833333
4       16.666667
          ...    
5987    20.833333
5988    25.000000
5989    20.833333
5990    29.166667
5991    16.666667
Length: 5992, dtype: float64

In [10]:
# Index labels of the rows in which more than 50 percent of the values are missing.
null_row_indexes = null_rows_percentage.loc[null_rows_percentage > 50].index.tolist()
null_row_indexes

[]

## Drop columns with more than 50% null values

In [11]:
# Remove the mostly-empty columns identified in null_col_indexes.
data = data.drop(columns=null_col_indexes)

In [12]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Injury,Fatal (Y/N),Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


In [13]:
data.shape

(5992, 21)

## Drop rows that are still missing more than 33.33% of their values

In [14]:
# Recount the missing values per row now that the sparse columns have been dropped.
null_rows = data.isnull().sum(axis="columns")

In [15]:
# Percentage of missing values per row, restricted to rows with at least one null.
# Uses the frame's current column count rather than a hard-coded 21, so the result
# does not silently go stale if a different set of columns was dropped earlier.
null_rows_percentage = null_rows[null_rows > 0] / data.shape[1] * 100

In [16]:
# Index labels of the rows in which more than 33.33 percent of the values are missing.
null_row_indexes = null_rows_percentage.loc[null_rows_percentage > 33.33].index.tolist()
null_row_indexes

[4746, 4806, 5794, 5856]

In [17]:
# Remove the sparse rows identified in null_row_indexes.
data = data.drop(index=null_row_indexes)

In [18]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Injury,Fatal (Y/N),Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


In [19]:
data.shape

(5988, 21)

## Check On Similarly Named Columns

### Drop rows where `href` and `href formula` disagree

In [20]:
# Rows whose two link columns do not hold the same value. A missing value on either
# side also counts as a mismatch, because NaN never compares equal.
href_differences = data[data["href formula"] != data["href"]]
# Index labels of those rows, used below to drop them.
href_mistakes = href_differences.index
href_mistakes

Int64Index([  20,   27,   61,  107,  114,  134,  180,  193,  232,  262,  263,
             264,  271,  272,  293,  305,  323,  347,  361,  362,  363,  364,
             365,  367,  377,  378,  379,  380,  381,  382,  383,  384,  448,
             449,  823, 1217, 1218, 2274, 2477, 3019, 3549, 3603, 3917, 3928,
            4394, 4642, 4668, 4719, 5317, 5458, 5686, 5694, 5819, 5857],
           dtype='int64')

In [21]:
# Remove the rows whose href and href formula values differ.
data = data.drop(index=href_mistakes)

In [22]:
data.shape

(5934, 21)

### Drop rows where the three "Case Number" columns disagree

In [23]:
# Rows in which the three case-number columns are not all identical: either the first
# differs from the second, or the second differs from the third.
case_differences = data[
    (data["Case Number"] != data["Case Number.1"])
    | (data["Case Number.1"] != data["Case Number.2"])
]
# Index labels of those rows, used below to drop them.
case_mistakes = case_differences.index
case_mistakes

Int64Index([4, 33, 97, 116, 121, 169, 3296, 3569, 3654, 4177, 5043, 5150], dtype='int64')

In [24]:
# Remove the rows whose case-number columns disagree.
data = data.drop(index=case_mistakes)

In [25]:
data.shape

(5922, 21)

## Fix Countries

## Rename Columns

In [None]:
data.columns

In [None]:
# NOTE(review): neither 'Manufacturer' nor 'Displacement' appears among the columns
# listed by data.info() earlier in this notebook, so this mapping looks like a leftover
# from a different dataset. rename() ignores labels it cannot find by default, so the
# column names come out unchanged — confirm which renames are actually intended.
column_renames = {'Manufacturer': 'Make', 'Displacement': 'Engine Displacement'}
data = data.rename(columns=column_renames)
data.columns

## Data Type

In [None]:
data.dtypes

## Extreme Values And Outliers

In [None]:
data.describe()

## Low Variance Columns

In [None]:
# Collect numeric columns whose 90th percentile equals their minimum, i.e. columns in
# which at least 90 percent of the values sit at the lowest value.
low_variance = []
# select_dtypes is the public replacement for the private _get_numeric_data() helper.
for col in data.select_dtypes(include="number"):
    # Series.min / Series.quantile skip missing values, whereas builtin min() and
    # np.percentile give NaN or order-dependent results when a column contains NaN.
    minimum = data[col].min()
    ninety_percent = data[col].quantile(0.9)
    if ninety_percent == minimum:
        low_variance.append(col)

In [None]:
print(low_variance)

## Remove Duplicates

In [None]:
data.columns

In [None]:
# Columns compared when looking for duplicates; two rows are duplicates only when they
# match on every column listed here. 'Sex ' and 'Species ' carry a trailing space.
select_columns = [
    'Case Number', 'Date', 'Year', 'Type',
    'Country', 'Area', 'Location', 'Activity',
    'Name', 'Sex ', 'Age', 'Injury',
    'Fatal (Y/N)', 'Species ', 'Investigator or Source',
    'pdf', 'href formula', 'href',
    'Case Number.1', 'Case Number.2', 'original order',
]

# Row counts before and after de-duplication give the number of rows removed.
before = data.shape[0]
data = data.loc[:, select_columns].drop_duplicates()
after = data.shape[0]
print('Number of duplicate records dropped: ', before - after)