# Cleaning Data

## First Inspection / Handling Inconsistent Data

In [1]:
import pandas as pd

### Titanic Dataset

In [2]:
titanic = pd.read_csv("titanic.csv")

In [3]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [4]:
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
886,0,2,male,27.0,0,0,13.0,S,
887,1,1,female,19.0,0,0,30.0,S,B
888,0,3,female,,1,2,23.45,S,
889,1,1,male,26.0,0,0,30.0,C,C
890,0,3,male,32.0,0,0,7.75,Q,


In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [6]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic[["survived", "sex", "age", "fare"]].describe()

Unnamed: 0,survived,age,fare
count,891.0,714.0,891.0
mean,0.383838,29.699118,32.204208
std,0.486592,14.526497,49.693429
min,0.0,0.42,0.0
25%,0.0,20.125,7.9104
50%,0.0,28.0,14.4542
75%,1.0,38.0,31.0
max,1.0,80.0,512.3292


In [8]:
titanic.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [9]:
titanic.survived.replace(to_replace= [0, 1], value = ["no", "yes"], inplace = True)

In [10]:
titanic.survived.value_counts()

no     549
yes    342
Name: survived, dtype: int64

In [11]:
titanic.rename(columns = {"sex": "gender"}, inplace = True)

In [12]:
titanic.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,


In [13]:
titanic.iloc[:, 1:].describe()

Unnamed: 0,pclass,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


## String Operations

In [14]:
titanic.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,


In [15]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    object 
 1   pclass    891 non-null    int64  
 2   gender    891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 62.8+ KB


In [16]:
pd.to_numeric(titanic.fare)

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [17]:
titanic.deck.str.replace("$", "")

  titanic.deck.str.replace("$", "")


0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: deck, Length: 891, dtype: object

In [18]:
summer = pd.read_csv()

TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head(20)

In [None]:
summer.info()

In [None]:
summer.Athlete = summer.Athlete.str.title()
summer

In [None]:
summer.loc[summer.Athlete == "Hajos, Alfred"]

In [None]:
summer.iloc[0, 4]

In [None]:
summer.Athlete.str.strip()

removes whitespace

In [None]:
titanic.info()

In [None]:
titanic.survived.replace(to_replace= ["no", "yes"], value = [0, 1], inplace = True)

In [None]:
titanic

In [None]:
titanic.fare.astype("float")

changes datatype to whatever

In [None]:
titanic["fare"] = titanic.fare.astype("float")

In [None]:
titanic["survived"] = titanic.survived.astype("int")

## Intro to NA Values

In [20]:
import numpy as np

In [21]:
sales = pd.read_csv("sales.csv", index_col = 0)

In [22]:
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,,33
Mike,45,9,74,87.0,12
Andi,17,33,54,8.0,29
Paul,87,67,27,45.0,7


In [23]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     4 non-null      int64  
 2   Wed     4 non-null      int64  
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(1), int64(4)
memory usage: 192.0+ bytes


In [25]:
sales.loc["Steven", "Thu"]

nan

In [27]:
# sales.iloc[1,1] = None   
# this can produce missing values manually

In [28]:
# sales.iloc[2,2] = np.nan
# this can as well

In [29]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     4 non-null      int64  
 2   Wed     4 non-null      int64  
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(1), int64(4)
memory usage: 364.0+ bytes


In [30]:
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,,33
Mike,45,9,74,87.0,12
Andi,17,33,54,8.0,29
Paul,87,67,27,45.0,7


In [31]:
sales.iloc[0,3] = 0

In [36]:
sales.Thu = sales.Thu.astype("int")