# Cleaning Data

## First Inspection / Handling Inconsistent Data

In [1]:
import pandas as pd

### Titanic Dataset

In [2]:
titanic = pd.read_csv("titanic.csv")

In [3]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [4]:
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
886,0,2,male,27.0,0,0,13.0,S,
887,1,1,female,19.0,0,0,30.0,S,B
888,0,3,female,,1,2,23.45,S,
889,1,1,male,26.0,0,0,30.0,C,C
890,0,3,male,32.0,0,0,7.75,Q,


In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [6]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic[["survived", "sex", "age", "fare"]].describe()

Unnamed: 0,survived,age,fare
count,891.0,714.0,891.0
mean,0.383838,29.699118,32.204208
std,0.486592,14.526497,49.693429
min,0.0,0.42,0.0
25%,0.0,20.125,7.9104
50%,0.0,28.0,14.4542
75%,1.0,38.0,31.0
max,1.0,80.0,512.3292


In [8]:
titanic.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [9]:
# Relabel the 0/1 survival flag as "no"/"yes" and assign the result back.
# Calling replace(..., inplace=True) on `titanic.survived` acts on a temporary
# Series: pandas warns about it, and under Copy-on-Write `titanic` stays unchanged.
titanic["survived"] = titanic["survived"].replace(to_replace=[0, 1], value=["no", "yes"])

In [10]:
titanic.survived.value_counts()

no     549
yes    342
Name: survived, dtype: int64

In [11]:
# Rename the "sex" column to "gender". Reassigning the returned frame (rather
# than using inplace=True) keeps the step explicit and chainable.
titanic = titanic.rename(columns={"sex": "gender"})

In [12]:
titanic.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,


In [13]:
titanic.iloc[:, 1:].describe()

Unnamed: 0,pclass,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


## String Operations

In [14]:
titanic.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,


In [15]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    object 
 1   pclass    891 non-null    int64  
 2   gender    891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 62.8+ KB


In [16]:
pd.to_numeric(titanic.fare)

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [17]:
# Strip literal "$" characters from the deck labels (result is only displayed).
# regex=False makes "$" a plain character; as a regex it is the end-of-string
# anchor, and older pandas versions warn when the mode is left implicit.
titanic["deck"].str.replace("$", "", regex=False)

  titanic.deck.str.replace("$", "")


0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: deck, Length: 891, dtype: object

In [18]:
# Load the summer dataset. read_csv() requires a path; calling it with no
# argument raises TypeError and stops a top-to-bottom run of the notebook.
summer = pd.read_csv("summer.csv")

In [None]:
summer = pd.read_csv("summer.csv")

In [None]:
summer.head(20)

In [None]:
summer.info()

In [None]:
# Convert athlete names to title case and store them back on the frame,
# then display the updated frame.
summer["Athlete"] = summer["Athlete"].str.title()
summer

In [None]:
summer.loc[summer.Athlete == "Hajos, Alfred"]

In [None]:
summer.iloc[0, 4]

In [None]:
summer.Athlete.str.strip()

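`str.strip()` removes leading and trailing whitespace from each name. The result is not assigned here, so `summer.Athlete` itself is unchanged.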

In [None]:
titanic.info()

In [None]:
# Map the "no"/"yes" labels back to 0/1 and assign the result back.
# replace(..., inplace=True) on `titanic.survived` would act on a temporary
# Series and leave `titanic` unchanged under pandas Copy-on-Write.
titanic["survived"] = titanic["survived"].replace(to_replace=["no", "yes"], value=[0, 1])

In [None]:
titanic

In [None]:
titanic.fare.astype("float")

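`astype()` returns the column converted to the requested dtype (here `float`); the DataFrame is unchanged until the result is assigned back.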

In [None]:
# Persist the float conversion by writing the converted column back.
titanic["fare"] = titanic["fare"].astype("float")

In [None]:
# Store the survival flag as an integer column.
titanic["survived"] = titanic["survived"].astype("int")

## Intro to NA Values

In [None]:
import numpy as np

In [None]:
sales = pd.read_csv("sales.csv", index_col = 0)

In [None]:
sales

In [None]:
sales.info()

In [None]:
sales.loc["Steven", "Thu"]

In [None]:
# sales.iloc[1,1] = None   
# this can produce missing values manually

In [None]:
# sales.iloc[2,2] = np.nan
# this can as well

In [None]:
sales.info()

In [None]:
sales

In [None]:
# Overwrite the cell at row position 0, column position 3 with 0.
# NOTE(review): presumably this is the missing "Thu" value, set so that the
# astype("int") conversion of Thu further down can succeed - confirm against sales.csv.
sales.iloc[0,3] = 0

In [None]:
# Convert the Thu column to integers and write it back to the frame.
sales["Thu"] = sales["Thu"].astype("int")

In [None]:
sales

## Detecting Missing Values

In [None]:
titanic

In [None]:
titanic.info()

In [None]:
titanic.isna()

In [None]:
titanic.isna().sum(axis = 0)

In [None]:
titanic.isna().any(axis = 0)

In [None]:
titanic[titanic.isna().any(axis = 1)]

In [None]:
titanic.notna()

In [None]:
titanic.notna().sum(axis = 0)

In [None]:
titanic.notna().all(axis = 0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the boolean "value is present" mask for every cell of the frame,
# drawn on an explicitly created 12x8 inch figure.
_, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(titanic.notna(), ax=ax)
plt.show()

In [None]:
titanic.age.value_counts(dropna = False)

## Removing Missing Values with dropna()

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
titanic[titanic.embarked.isna()]

In [None]:
titanic.age.value_counts(dropna = False)

In [None]:
titanic.age.mean(skipna = True)

In [None]:
titanic.shape

In [None]:
titanic.dropna().shape

dropping all entries with missing data leaves us with 182 entries. that is too low.

In [None]:
titanic.dropna(axis = 0, how = "any").shape

rows with ANY missing value leaves us with 182 entries

In [None]:
titanic.dropna(axis = 1, how = "any").shape

dropping columns that have any missing values leaves us with 6 columns

In [None]:
titanic.dropna(axis = 0, how = "all").shape

Dropping rows in which ALL values are missing leaves us with the original 891 rows.

this means that no entries are fully empty

In [None]:
titanic.dropna(axis = 1, how = "all").shape

same for columns

In [None]:
titanic.dropna(axis = 0, thresh = 8).shape

Drops all rows with fewer than 8 non-null values.

In [None]:
titanic.dropna(axis = 1, thresh = 500).shape

Drops all columns with fewer than 500 non-null values.

dropped the deck column

In [None]:
# Keep only columns with at least 500 non-null values. Reassigning the
# returned frame (rather than using inplace=True) keeps the step explicit
# and chainable.
titanic = titanic.dropna(axis=1, thresh=500)

In [None]:
titanic.head()

In [None]:
titanic.shape

In [None]:
titanic.dropna(axis = 0, subset = ["survived", "pclass", "gender", "age"], thresh = 4).shape

dropping all rows that do not have at least 4 (aka all) of the subset values

In [None]:
titanic.dropna(axis = 0, subset = ["survived", "pclass", "gender", "age"], how = "any").shape

another way to say this

## Replacing Missing Values

In [None]:
# Reload the raw CSV so the columns dropped in the previous section are available again.
# NOTE(review): a fresh load also discards the earlier "sex" -> "gender" rename and
# the survived relabelling; the saved outputs below still show "gender" and
# "no"/"yes", so they appear to come from a session where this cell was not run - confirm.
titanic = pd.read_csv("titanic.csv")

In [19]:
titanic.head(20)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,
5,no,3,male,,0,0,8.4583,Q,
6,no,1,male,54.0,0,0,51.8625,S,E
7,no,3,male,2.0,3,1,21.075,S,
8,yes,3,female,27.0,0,2,11.1333,S,
9,yes,2,female,14.0,1,0,30.0708,C,


In [20]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    object 
 1   pclass    891 non-null    int64  
 2   gender    891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 62.8+ KB


In [21]:
titanic[titanic.age.isna()]

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
5,no,3,male,,0,0,8.4583,Q,
17,yes,2,male,,0,0,13.0000,S,
19,yes,3,female,,0,0,7.2250,C,
26,no,3,male,,0,0,7.2250,C,
28,yes,3,female,,0,0,7.8792,Q,
...,...,...,...,...,...,...,...,...,...
859,no,3,male,,0,0,7.2292,C,
863,no,3,female,,8,2,69.5500,S,
868,no,3,male,,0,0,9.5000,S,
878,no,3,male,,0,0,7.8958,S,


In [22]:
titanic.age.mean()

29.69911764705882

In [23]:
# Mean age rounded to one decimal place; displayed as the cell result.
mean = round(titanic["age"].mean(), 1)
mean

29.7

In [24]:
# Fill missing ages with the rounded mean and assign the column back.
# fillna(..., inplace=True) on `titanic.age` acts on a temporary Series:
# pandas warns about it, and under Copy-on-Write `titanic` stays unchanged.
titanic["age"] = titanic["age"].fillna(mean)

In [26]:
titanic.head(20)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,deck
0,no,3,male,22.0,1,0,7.25,S,
1,yes,1,female,38.0,1,0,71.2833,C,C
2,yes,3,female,26.0,0,0,7.925,S,
3,yes,1,female,35.0,1,0,53.1,S,C
4,no,3,male,35.0,0,0,8.05,S,
5,no,3,male,29.7,0,0,8.4583,Q,
6,no,1,male,54.0,0,0,51.8625,S,E
7,no,3,male,2.0,3,1,21.075,S,
8,yes,3,female,27.0,0,2,11.1333,S,
9,yes,2,female,14.0,1,0,30.0708,C,


This fills all missing `age` values with the rounded mean age (29.7).

## Detection of Duplicates

In [32]:
# Small demo frame with repeated letters ("c" twice, "g" three times),
# used to illustrate duplicate detection.
alphabet = pd.DataFrame({"Alphabet": list("abccdefggg")})

In [33]:
alphabet

Unnamed: 0,Alphabet
0,a
1,b
2,c
3,c
4,d
5,e
6,f
7,g
8,g
9,g


In [34]:
alphabet.duplicated(keep = False)

0    False
1    False
2     True
3     True
4    False
5    False
6    False
7     True
8     True
9     True
dtype: bool

In [35]:
alphabet[alphabet.duplicated(keep = False)]

Unnamed: 0,Alphabet
2,c
3,c
7,g
8,g
9,g


In [36]:
alphabet.duplicated(keep = "first")

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8     True
9     True
dtype: bool

In [37]:
alphabet[alphabet.duplicated(keep = "first")]

Unnamed: 0,Alphabet
3,c
8,g
9,g
