# Working With Types
## Casting Types & Missing Values

In [1]:
import pandas as pd
# Load the course datasets from CSV files in the local data/ directory.
# NOTE(review): only `titanic` is referenced by the cells visible below; the other
# frames are presumably used elsewhere in the course material - confirm before removing.
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
# This file is pipe-delimited, and its first column is used as the row index.
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
btc = pd.read_csv("data/coin_Bitcoin.csv")
countries = pd.read_csv("data/world-happiness-report-2021.csv")

## Casting With astype()

In [2]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [3]:
titanic["age"].value_counts()

?         263
24         47
22         43
21         41
30         40
         ... 
66          1
0.6667      1
76          1
67          1
26.5        1
Name: age, Length: 99, dtype: int64

In [4]:
# Expected failure: "age" is an object column that still contains the placeholder
# string "?", which cannot be parsed as a number, so astype("float") raises ValueError.
titanic["age"].astype("float")

ValueError: could not convert string to float: '?'

In [None]:
# Swap the "?" placeholder for None so the column can be cast to float afterwards.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# selected column is chained assignment, which emits a warning on recent pandas and
# leaves `titanic` untouched once Copy-on-Write is enabled.
titanic["age"] = titanic["age"].replace(["?"], [None])

In [None]:
titanic.age.value_counts(dropna=False)

In [None]:
titanic["age"].astype("float")

In [None]:
titanic.info()

In [None]:
titanic["age_float"] = titanic["age"].astype("float")

In [None]:
titanic

In [None]:
titanic.info()

In [None]:
titanic["age"] = titanic["age"].astype("float")

In [None]:
titanic.info()

In [None]:
titanic["age"].mean()

In [None]:
titanic["sex"].astype("category")

In [None]:
titanic["sex"] = titanic["sex"].astype("category")

In [None]:
titanic.info()

In [None]:
# Assigning a scalar replaces the whole column: every row now holds the string "MALE",
# the original values are lost (until the CSV is reloaded), and the new column does not
# keep the category dtype that was set in the previous step.
titanic["sex"] = "MALE"

In [None]:
titanic.info()

In [None]:
titanic["embarked"] = titanic["embarked"].astype('category')

In [None]:
titanic.info()

## Casting with pd.to_numeric()

In [None]:
titanic = pd.read_csv("data/titanic.csv")
titanic["age"].value_counts()

In [None]:
pd.to_numeric(titanic["age"], errors="coerce")

In [None]:
titanic["age"] = pd.to_numeric(titanic["age"], errors="coerce")

In [None]:
titanic.info()

In [None]:
titanic["age"].describe()

## isna() and dropna()

In [None]:
stats = pd.read_csv("data/game_stats.csv")

In [None]:
stats

In [None]:
stats.isna()

In [None]:
stats[stats["league"].isna()]

In [None]:
stats["assists"].dropna()

In [None]:
# Keep only the non-missing assists values. dropna() returns a new Series, so `stats`
# is left unchanged and there is no in-place mutation of a Series taken from a DataFrame.
assists = stats["assists"].dropna()

In [None]:
assists

In [None]:
stats

In [None]:
stats.dropna()

## dropna() returns a new DataFrame; `stats` itself is unchanged

In [None]:
stats

In [None]:
stats = pd.read_csv("data/game_stats.csv")
stats

In [None]:
stats.dropna(how="all")

In [None]:
stats.dropna(subset=["league","points"])

In [None]:
stats.dropna(axis=1)

## Filling NA values with fillna()

In [None]:
stats

In [None]:
stats.fillna(0)

In [None]:
# Label players without a league as "amateur". The filled Series is assigned back to the
# column because fillna(..., inplace=True) on a selected column is chained assignment
# and does not update `stats` when Copy-on-Write is enabled.
stats["league"] = stats["league"].fillna("amateur")

In [None]:
stats

In [None]:
# Per-column fill values: missing "points" become 0 and missing "assists" become the
# string "NONE". Only these two columns are filled, and the result is a new DataFrame.
# Note: "NONE" is text in an otherwise numeric column, so "assists" ends up mixed-type.
stats.fillna({"points": 0, "assists": "NONE"})

In [None]:
sales = pd.read_csv("data/sales.csv")

In [None]:
sales

In [None]:
# Fall back to the billing zip wherever the shipping zip is missing (values are aligned
# on the row index). The result is assigned back to the column rather than using
# inplace=True on a selected column, which would not update `sales` under Copy-on-Write.
sales["shipping_zip"] = sales["shipping_zip"].fillna(sales["billing_zip"])

In [None]:
sales

In [21]:
import pandas as pd
# Re-read the datasets from disk, which resets `titanic` to its raw, uncleaned state
# (the "age" column is an object column again in the info() output that follows).
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
# Pipe-delimited file; the first column is used as the row index.
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
btc = pd.read_csv("data/coin_Bitcoin.csv")
countries = pd.read_csv("data/world-happiness-report-2021.csv")

In [22]:
 titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [23]:
# Replace the "?" placeholder with None so the column can be cast to float next.
# Assigning the result back avoids an in-place call on a selected column, which is
# chained assignment and stops updating `titanic` once Copy-on-Write is enabled.
titanic["age"] = titanic["age"].replace(["?"], [None])

In [24]:
titanic["age"].value_counts(dropna=False)

None      263
24         47
22         43
21         41
30         40
         ... 
66          1
0.6667      1
76          1
67          1
26.5        1
Name: age, Length: 99, dtype: int64

In [25]:
  titanic["age"]=titanic["age"].astype("float")


In [26]:
titanic["age"].mean()

29.8811345124283

In [27]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB


In [28]:
# Store the "sex" column as a pandas categorical; all other columns keep their dtype.
titanic = titanic.astype({"sex": "category"})

In [29]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   object  
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(1), float64(1), int64(4), object(8)
memory usage: 134.5+ KB


In [31]:
# Convert "embarked" to a categorical column. Bracket indexing is used for the
# assignment: attribute-style assignment only works when the column already exists and
# its name does not clash with a DataFrame attribute; otherwise it silently sets a
# plain Python attribute instead of a column.
titanic["embarked"] = titanic["embarked"].astype("category")

In [32]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   category
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(2), float64(1), int64(4), object(7)
memory usage: 125.7+ KB


In [33]:
titanic = pd.read_csv("data/titanic.csv")


In [38]:
# Parse "age" as numbers; errors="coerce" turns unparseable entries (the "?" placeholder)
# into NaN instead of raising. Bracket indexing is used because attribute-style column
# assignment is fragile (it only targets a column when one with that name already exists).
titanic["age"] = pd.to_numeric(titanic["age"], errors="coerce")

In [39]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB


In [40]:
titanic.age.describe()

count    1046.000000
mean       29.881135
std        14.413500
min         0.166700
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: age, dtype: float64

In [42]:
 stats=pd.read_csv("data/game_stats.csv")

In [43]:
stats

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
5,steph,nba,49.0,8.0,10.0
6,,,,,


In [44]:
stats.isna()

Unnamed: 0,name,league,points,assists,rebounds
0,False,False,False,False,False
1,False,True,False,True,False
2,False,False,True,True,True
3,False,False,False,True,False
4,False,True,False,True,True
5,False,False,False,False,False
6,True,True,True,True,True


In [48]:
# Rows for which an "assists" value is present.
stats[stats["assists"].notna()]

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
5,steph,nba,49.0,8.0,10.0


In [50]:
stats.assists.dropna()

0    5.0
5    8.0
Name: assists, dtype: float64

In [51]:
stats.dropna()

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
5,steph,nba,49.0,8.0,10.0


In [52]:
stats.dropna(how="all")

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
5,steph,nba,49.0,8.0,10.0


In [53]:
stats

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
5,steph,nba,49.0,8.0,10.0
6,,,,,


In [55]:
stats.dropna(subset=["league","points"])

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
3,jackson,aba,9.0,,2.0
5,steph,nba,49.0,8.0,10.0


In [56]:
stats.dropna(axis=1)

0
1
2
3
4
5
6


In [57]:
stats.fillna(0)

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,0,10.0,0.0,2.0
2,stu,euroleague,0.0,0.0,0.0
3,jackson,aba,9.0,0.0,2.0
4,timothee,0,8.0,0.0,0.0
5,steph,nba,49.0,8.0,10.0
6,0,0,0.0,0.0,0.0


In [59]:
stats.league.fillna("amateur")

0           nba
1       amateur
2    euroleague
3           aba
4       amateur
5           nba
6       amateur
Name: league, dtype: object

In [60]:
stats.fillna({"points":0,"assists":"NONE"})

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,NONE,2.0
2,stu,euroleague,0.0,NONE,
3,jackson,aba,9.0,NONE,2.0
4,timothee,,8.0,NONE,
5,steph,nba,49.0,8.0,10.0
6,,,0.0,NONE,


In [61]:
sales=pd.read_csv("data/sales.csv")

In [62]:
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,,92660.0
5,,,
6,,60007.0,60007.0


In [69]:
# Use the billing zip wherever the shipping zip is missing (aligned on the row index).
# The filled Series is assigned back to the column: fillna(..., inplace=True) on a
# selected column is chained assignment and does not update `sales` under Copy-on-Write.
sales["shipping_zip"] = sales["shipping_zip"].fillna(sales["billing_zip"])

In [70]:
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,81220.0,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,92660.0,92660.0
5,,,
6,,60007.0,60007.0
