## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Sample dataset

In [2]:
data = {
    'name': ['John', 'Mary', 'Mark', 'Andria', 'Robert','Eva'],
    'age': ['25', 'np.nan', '30', '29', 'NaN', '25'],
    'gender':['Male','Female', 'Male', 'np.nan', 'Male', 'Female'],
    'country': ['USA', 'usa', 'Canada', 'CANADA', 'UK', 'USA'],
    'quantity': [2, 1, 3, 5, 2, 2],
    'unit_price': [20, 15, np.nan, 10, 30, 20]
}

df = pd.DataFrame(data)
df

Unnamed: 0,name,age,gender,country,quantity,unit_price
0,John,25,Male,USA,2,20.0
1,Mary,np.nan,Female,usa,1,15.0
2,Mark,30,Male,Canada,3,
3,Andria,29,np.nan,CANADA,5,10.0
4,Robert,,Male,UK,2,30.0
5,Eva,25,Female,USA,2,20.0


In [3]:
# Handling missing values
df.fillna(0)

Unnamed: 0,name,age,gender,country,quantity,unit_price
0,John,25,Male,USA,2,20.0
1,Mary,np.nan,Female,usa,1,15.0
2,Mark,30,Male,Canada,3,0.0
3,Andria,29,np.nan,CANADA,5,10.0
4,Robert,,Male,UK,2,30.0
5,Eva,25,Female,USA,2,20.0


In [4]:
#removing rows with missing values
df.dropna()

Unnamed: 0,name,age,gender,country,quantity,unit_price
0,John,25,Male,USA,2,20.0
1,Mary,np.nan,Female,usa,1,15.0
3,Andria,29,np.nan,CANADA,5,10.0
4,Robert,,Male,UK,2,30.0
5,Eva,25,Female,USA,2,20.0


In [5]:
#Removing duplicates
df.drop_duplicates()

Unnamed: 0,name,age,gender,country,quantity,unit_price
0,John,25,Male,USA,2,20.0
1,Mary,np.nan,Female,usa,1,15.0
2,Mark,30,Male,Canada,3,
3,Andria,29,np.nan,CANADA,5,10.0
4,Robert,,Male,UK,2,30.0
5,Eva,25,Female,USA,2,20.0


In [6]:
#Correcting datatypes
#df['age'] = df['age'].astype(int)

#Removing incosnsistant datatypes
df['country'] = df['country'].str.lower()

print(df)

     name     age  gender country  quantity  unit_price
0    John      25    Male     usa         2        20.0
1    Mary  np.nan  Female     usa         1        15.0
2    Mark      30    Male  canada         3         NaN
3  Andria      29  np.nan  canada         5        10.0
4  Robert     NaN    Male      uk         2        30.0
5     Eva      25  Female     usa         2        20.0


In [17]:
# Correcting datatypes
df['age'] = pd.to_numeric(df['age'], errors='coerce')

df['age'] = df['age'].fillna(df['age'].median())
df['age'] = df['age'].astype(int)

## Data preparation
- Making data ready for analysing, e.g visualising, stats etc
- Aslso includes feature engineering making new coloumns from existing ones

In [7]:
#Feature engineering
df['total_price'] = df['quantity'] * df['unit_price'] # total_price coloumn just created now
print(df)

     name     age  gender country  quantity  unit_price  total_price
0    John      25    Male     usa         2        20.0         40.0
1    Mary  np.nan  Female     usa         1        15.0         15.0
2    Mark      30    Male  canada         3         NaN          NaN
3  Andria      29  np.nan  canada         5        10.0         50.0
4  Robert     NaN    Male      uk         2        30.0         60.0
5     Eva      25  Female     usa         2        20.0         40.0


## Encoding catergorical values
- Converting text labels into specific numbers

In [8]:
# Encoding categorical values 
pd.get_dummies(df['gender']) #one hot encoding is applied here

Unnamed: 0,Female,Male,np.nan
0,False,True,False
1,True,False,False
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,False


## Scaling and normalization
- for machine learning models

In [14]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['total_price']] = scaler.fit_transform(df[['total_price']])


## Spliting data for training and testing

In [21]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

## Data visualization

In [22]:
# Using a new data set
import pandas as pd

data = {
    'name': ['John', 'Mary', 'Mark', 'Andria'],
    'Age': ['25', None, '30', None],
    'City':['NY','LA', None, 'Chicago']
}
df = pd.DataFrame(data)
print(df)

     name   Age     City
0    John    25       NY
1    Mary  None       LA
2    Mark    30     None
3  Andria  None  Chicago


In [25]:
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Filling in missing values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['City'] = df['City'].fillna('Unknown')
print(df)

     name   Age     City
0    John  25.0       NY
1    Mary  27.5       LA
2    Mark  30.0  Unknown
3  Andria  27.5  Chicago


In [27]:
df = pd.DataFrame({
    'ID': [1, 2, 2, 3],
    'Name': ['John', 'Mary', 'Mark', 'John'],
})

df = df.drop_duplicates()
print(df)

   ID  Name
0   1  John
1   2  Mary
2   2  Mark
3   3  John


In [30]:
#Feature engineering (encoding)
df = pd.DataFrame({'Gender': ['Male', 'Female', 'Female', 'Male']})
encoded = pd.get_dummies(df['Gender'])
encoded = encoded.astype(int)
print(encoded)

   Female  Male
0       0     1
1       1     0
2       1     0
3       0     1
