## Data wrangling and cleaning using Python



In [1]:
import pandas as pd

In [2]:
# Load the merged CSV file into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer='merged_clean_ver1.csv')

# Show the DataFrame (a bare last expression is rendered as the cell output)
data

Unnamed: 0.1,Unnamed: 0,id,state,gender,median_home_val,median_household_income,ic4,hvp1,ic5,pobc1,pobc2,ic2,ic3,avggift,tcode,dob,domain,target_d
0,0,44060,FL,M,AAA896,392,520.0,7,21975,6,16,430.0,466,28.000000,1,1901,C2,100.0
1,1,96093,IL,M,537.00,365,473.0,0,19387,1,89,415.0,410,5.666667,0,0,T2,7.0
2,2,43333,FL,F,725.00,301,436.0,3,18837,11,17,340.0,361,4.111111,0,2501,C2,5.0
3,3,21885,NC,M,AAA1095,401,413.0,7,14014,1,74,407.0,399,27.277778,0,2208,T2,38.0
4,4,190108,FL,F,995.00,252,348.0,0,17991,5,6,280.0,316,6.000000,28,0,C2,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943,1010,161838,CA,F,1953,304,380.0,47,13811,25,43,353.0,337,13.500000,0,4212,C2,14.0
1944,1011,161838,CA,F,1953,304,380.0,47,13811,25,43,353.0,337,13.500000,0,4212,C2,14.0
1945,1012,138311,AZ,Female,1708,437,684.0,36,29098,7,19,586.0,551,9.769231,2,1403,S1,20.0
1946,1013,123469,TX,M,561,493,540.0,1,16623,5,68,529.0,506,5.200000,0,0,T2,5.0


In [3]:
# List the column labels of the DataFrame
data.columns

Index(['Unnamed: 0', 'id', 'state', 'gender', 'median_home_val',
       'median_household_income', 'ic4', 'hvp1', 'ic5', 'pobc1', 'pobc2',
       'ic2', 'ic3', 'avggift', 'tcode', 'dob', 'domain', 'target_d'],
      dtype='object')

## Key concepts - 2
- Deleting columns
- Rearranging columns
- Filtering and subsetting

### Deleting columns

In [None]:
# deleting columns
# Demonstration of the `axis` argument: drop() defaults to axis=0, so it looks
# for a *row* labelled 'TCODE' and raises a KeyError. The error is caught and
# printed so the notebook still runs from top to bottom.
try:
    data = data.drop(['TCODE'])
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
# Demonstration: axis=1 targets the columns, but labels are case-sensitive and
# the column is called 'tcode', so 'TCODE' still raises a KeyError.
# Hint: is TCODE present in data.columns?
try:
    data = data.drop(['TCODE'], axis=1)
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
# Correct call: use the exact column label and drop it from the columns
data = data.drop(columns=['tcode'])

In [None]:
# Check the remaining column labels after the drop
data.columns

### Rearranging columns

In [None]:
# Rearranging columns: selecting with a list returns the columns in list order.
# Columns that are left out of the list are not kept in the result.
data = data[[
    'id', 'state', 'gender',
    'median_home_val', 'median_household_income',
    'ic2', 'ic3', 'ic4', 'ic5',
    'avggift', 'domain', 'dob', 'target_d',
]]

In [None]:
# Display the DataFrame to check the new column order
data

### Filtering and subsetting

In [None]:
# Filtering and subsetting: a boolean mask keeps only the rows where it is True
data.loc[data['gender'].eq('M')]

In [None]:
# Keep the rows whose gender is one of the listed values
data.loc[data['gender'].isin(['M', 'F'])]

In [None]:
# Same selection written as two conditions combined with | (element-wise OR)
data.loc[data['gender'].eq('M') | data['gender'].eq('F')]

In [None]:
# Numeric condition: rows where target_d is strictly greater than 100
data.loc[data['target_d'].gt(100)]

## Key concepts - 3

- Reset index
- Working with indexes
- Handling null values

In [None]:
# Display the current DataFrame before filtering it
data

### Filter and reset the index

In [None]:
# Filter, then reset the index.
# Teaching note: stress again how useful it is to experiment with the code
# and check each output.

# Keep only the rows whose gender is 'M'; the following cells work on this subset
filtered = data.loc[data['gender'].eq('M')]

In [None]:
# Display the filtered rows; note the index labels they carry over from `data`
filtered

In [None]:
# Rebuild the index of `filtered`; drop=True discards the old index instead of
# inserting it as a new column.
filtered = filtered.reset_index(drop=True) # what will happen after resetting the index?

In [None]:
# Display `filtered` again to compare its index with the earlier output
filtered

In [None]:
# Work on a copy so that `filtered` itself is left untouched.
temp = filtered.copy()
# The result of set_index is only displayed here, not assigned back to `temp`.
temp.set_index('state') # This is a dummy case: an index should normally hold unique, non-null labels; the default one auto-increments by 1

In [None]:
# Working with indexes
# A bare slice on a DataFrame selects rows; the stop value is excluded.
filtered[1:4]

In [None]:
# Pick three columns, then show their first ten rows
filtered[['gender', 'ic2', 'ic3']].head(10)

In [None]:
# .loc slices by index label and includes both endpoints
filtered.loc[1:3]

In [None]:
# A single label returns that one row as a Series (KeyError if the label is absent)
filtered.loc[100]

In [None]:
# .iloc slices by integer position; the stop position is excluded
filtered.iloc[1:3]

In [None]:
# Now select by position on both axes: row positions first, then column positions
filtered.iloc[1:10,0:4]

In [None]:
# .iloc also accepts explicit lists of row positions and column positions
filtered.iloc[[1,2,3,4],[0,2,4]]

### Handling null values

In [None]:
# Boolean Series marking the missing values in 'gender'
data['gender'].isna()  # append .sum() to count them

In [None]:
# Show the rows that remain after removing every row with a missing value.
# The result is not assigned, so `data` keeps its original rows.
data.dropna()

In [None]:
# fillna() must be told what to fill with; calling it without a value raises
# an error. Here missing 'gender' entries are replaced with 'unknown'.
# The result is only displayed, so `data` itself is not modified.
data.fillna({'gender': 'unknown'})

## Key concepts - 4

- Correcting data types
- Removing duplicates

### Data types

In [4]:
# Show the dtype of every column
data.dtypes

Unnamed: 0                   int64
id                           int64
state                       object
gender                      object
median_home_val             object
median_household_income      int64
ic4                        float64
hvp1                         int64
ic5                         object
pobc1                        int64
pobc2                        int64
ic2                        float64
ic3                          int64
avggift                    float64
tcode                        int64
dob                          int64
domain                      object
target_d                   float64
dtype: object

In [None]:
# Keep only the numeric columns, using the public select_dtypes API rather
# than a private, underscore-prefixed helper.
data.select_dtypes(include='number')

In [None]:
# Keep only the boolean columns, using the public select_dtypes API rather
# than a private, underscore-prefixed helper.
data.select_dtypes(include='bool')

In [None]:
# Keep only the columns stored with the generic object dtype
data.select_dtypes(include=['object'])

### Correcting data types

In [None]:
# will this work? why/why not?
# The column holds some non-numeric strings (e.g. 'AAA896'), so to_numeric
# raises a ValueError. Catch it and print the message so that the notebook
# still runs from top to bottom.
try:
    pd.to_numeric(data['median_home_val'])
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# errors='coerce' turns the values that cannot be parsed into NaN.
# assign() returns a new DataFrame instead of writing a column into a frame
# that was itself derived from another one, which can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = data.assign(
    median_home_val=pd.to_numeric(data['median_home_val'], errors='coerce')
)

In [None]:
# Display the DataFrame to inspect the converted column
data

In [None]:
# Convert 'ic5' to numbers; errors='coerce' turns unparseable values into NaN.
# assign() returns a new DataFrame instead of writing a column into a frame
# that was itself derived from another one, which can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = data.assign(ic5=pd.to_numeric(data['ic5'], errors='coerce'))

In [None]:
# Public select_dtypes API instead of a private, underscore-prefixed helper
data.select_dtypes(include='number') # to check if 'median_home_val' and 'ic5' are now listed as numeric data

### Removing duplicates

In [None]:
# Removing duplicates: rows that repeat an earlier row in every column are dropped.
# keep='first' is the default; try keep='last' or keep=False and compare the results.
data = data.drop_duplicates(keep='first')

In [None]:
# Display the DataFrame after removing the duplicate rows
data

In [None]:
# Optional example (left commented out): to treat rows as duplicates based on
# some specific columns only, pass them through the `subset` argument:
# temp = temp.drop_duplicates(subset=['state','gender', 'ic2', 'ic3'])