# Inspecting Data with Pandas

Pandas allows you to easily and quickly inspect your data once it has been loaded.

In [16]:
import pandas as pd

# Load the people data into a DataFrame; thousands=',' tells read_csv to treat
# ',' as a thousands separator so values written like 1,000 parse as numbers.
df = pd.read_csv('people_data.csv', thousands=',')

### View the first 5 rows

In [5]:
# With no argument, head() shows the first 5 rows of the frame.
df.head()

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500


### View the last 5 rows

In [3]:
# With no argument, tail() shows the last 5 rows of the frame.
df.tail()

Unnamed: 0,first,last,age,height_cm,weight_kg,income
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950


### Display information about data types

In [6]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
first        6 non-null object
last         6 non-null object
age          6 non-null int64
height_cm    6 non-null float64
weight_kg    6 non-null float64
income       6 non-null int64
dtypes: float64(2), int64(2), object(2)
memory usage: 368.0+ bytes


### Summary Statistics

In [7]:
# Summarises the numeric columns only (count, mean, std, min, quartiles, max);
# the text columns 'first' and 'last' are left out of the result.
df.describe()

Unnamed: 0,age,height_cm,weight_kg,income
count,6.0,6.0,6.0,6.0
mean,19.0,154.083333,61.9,1430.0
std,0.894427,38.902669,5.250905,1039.807675
min,18.0,75.7,56.7,350.0
25%,18.25,163.075,57.625,845.0
50%,19.0,165.9,60.7,990.0
75%,19.75,174.5,66.025,2125.0
max,20.0,177.5,68.9,2950.0


### Checking for Duplicates

In [31]:
# Build a small demo frame in which the last two records (Tom Piper and
# Carol Winters) are exact repeats of earlier rows, so there are duplicates
# to detect.
#
# income is stored as plain integers. Mixing strings such as '1,000' with ints
# would leave the column as a mixed-type object column that cannot be used for
# numeric work and does not match the values displayed for this frame.

data = {'first': ['James', 'Jane', 'Adam', 'Sara', 'Tom', 'Carol', 'Tom', 'Carol'],
        'last': ['Smith', 'Watson', 'Miller', 'Thompson', 'Piper', 'Winters', 'Piper', 'Winters'],
        'age': [18, 18, 19, 19, 20, 20, 20, 20],
        'height_cm': [75.7, 163, 176.5, 163.3, 168.5, 177.5, 168.5, 177.5],
        'weight_kg': [66.9, 56.7, 68.9, 58, 57.5, 63.4, 57.5, 63.4],
        'income': [1000, 800, 350, 980, 2500, 2950, 2500, 2950]}

# create a DataFrame with an explicit column order
df = pd.DataFrame(data, columns=['first', 'last', 'age', 'height_cm', 'weight_kg', 'income'])

df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950
6,Tom,Piper,20,168.5,57.5,2500
7,Carol,Winters,20,177.5,63.4,2950


In [22]:
# Check whether the frame contains any duplicate row at all:
# duplicated() flags each repeated row and any() collapses that to one boolean.
df.duplicated().any()

True

In [34]:
# To see which rows are duplicates, look at the boolean Series itself:
# a row is True when it repeats an earlier row; the first occurrence stays False.
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

In [36]:
# Show the duplicate rows by using the boolean Series as a row filter

df[df.duplicated()]

Unnamed: 0,first,last,age,height_cm,weight_kg,income
6,Tom,Piper,20,168.5,57.5,2500
7,Carol,Winters,20,177.5,63.4,2950


### Handling Missing or NaN data

In [43]:
# Create a small demo frame with missing values: None marks a missing entry
# (one in 'age', two in 'weight_kg').
#
# income is stored as plain integers. Mixing strings such as '1,000' with ints
# would leave the column as a mixed-type object column that cannot be used for
# numeric work and does not match the values displayed for this frame.

data = {'first': ['James', 'Jane', 'Adam', 'Sara', 'Tom', 'Carol'],
        'last': ['Smith', 'Watson', 'Miller', 'Thompson', 'Piper', 'Winters'],
        'age': [18, 18, 19, 19, 20, None],
        'height_cm': [75.7, 163, 176.5, 163.3, 168.5, 177.5],
        'weight_kg': [66.9, 56.7, 68.9, None, None, 63.4],
        'income': [1000, 800, 350, 980, 2500, 2950]}

# create a DataFrame with an explicit column order
df = pd.DataFrame(data, columns=['first', 'last', 'age', 'height_cm', 'weight_kg', 'income'])

df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18.0,75.7,66.9,1000
1,Jane,Watson,18.0,163.0,56.7,800
2,Adam,Miller,19.0,176.5,68.9,350
3,Sara,Thompson,19.0,163.3,,980
4,Tom,Piper,20.0,168.5,,2500
5,Carol,Winters,,177.5,63.4,2950


### Checking for missing data

In [49]:
# Returns a same-shaped frame of booleans: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,True,False
4,False,False,False,False,True,False
5,False,False,True,False,False,False


### Fill NA 

In [51]:
# Specify what you want to fill the missing values with (here an empty string).
# The filled frame is only displayed; it is not assigned, so df keeps its gaps.

df.fillna(value='')

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18.0,75.7,66.9,1000
1,Jane,Watson,18.0,163.0,56.7,800
2,Adam,Miller,19.0,176.5,68.9,350
3,Sara,Thompson,19.0,163.3,,980
4,Tom,Piper,20.0,168.5,,2500
5,Carol,Winters,,177.5,63.4,2950


### Drop any Row with missing data

In [53]:
# dropna() returns a new DataFrame, so assign the result back to keep it;
# this avoids mutating the frame in place with inplace=True.
# axis=0 (or 'index') drops rows; axis=1 (or 'columns') would drop columns.
# how='any' drops a row as soon as a single value in it is missing.

df = df.dropna(axis=0, how='any')

df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18.0,75.7,66.9,1000
1,Jane,Watson,18.0,163.0,56.7,800
2,Adam,Miller,19.0,176.5,68.9,350
