# Step: 01 Basic data inspection

### Display the first 5 rows of the dataset


In [97]:
# Import Library
import pandas as pd

# Path to the population / net-migration CSV, relative to the notebook's
# working directory. Kept in one named constant so it is easy to find and change.
DATA_PATH = "../data_scraping/datasets/pop_migration.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(DATA_PATH)

# Preview the first 5 rows
df.head()

Unnamed: 0,Country,Year,total_population,net_migration
0,Afghanistan,2023,41454761.0,-48958.0
1,Afghanistan,2022,40578842.0,-647402.0
2,Afghanistan,2021,40000412.0,-548784.0
3,Afghanistan,2020,39068979.0,143634.0
4,Afghanistan,2019,37856121.0,9159.0


### Display the last 5 rows of the dataset

In [98]:
# Show the bottom 5 rows of the DataFrame
df.tail(5)

Unnamed: 0,Country,Year,total_population,net_migration
315,Pakistan,1964,50799999.0,-63298.0
316,Pakistan,1963,49447776.0,-64552.0
317,Pakistan,1962,48156128.0,-65780.0
318,Pakistan,1961,46921277.0,-66991.0
319,Pakistan,1960,45709310.0,0.0


### Display 5 random rows of the dataset

In [99]:
# Fixed seed so the same 5 rows are drawn on every Restart & Run All
df.sample(5, random_state=42)

Unnamed: 0,Country,Year,total_population,net_migration
78,Bangladesh,2009,150873714.0,-1150194.0
209,Sri Lanka,2006,20352411.0,-107045.0
285,Pakistan,1994,130748083.0,-277100.0
245,Sri Lanka,1970,12287110.0,-16299.0
197,Sri Lanka,2018,21670000.0,-28607.0


### Display the Summary of the dataset


In [100]:
# Structural overview: index range, column names, non-null counts, dtypes, memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           320 non-null    object 
 1   Year              320 non-null    int64  
 2   total_population  320 non-null    float64
 3   net_migration     320 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 10.1+ KB


### Display the statistical summary of the dataset


In [101]:
# Count, mean, std, min/max and quartiles for every numeric column
df.describe()

Unnamed: 0,Year,total_population,net_migration
count,320.0,320.0,320.0
mean,1991.5,238458600.0,-222220.9
std,18.501885,370145900.0,475187.2
min,1960.0,9035043.0,-2290411.0
25%,1975.75,19523540.0,-322429.5
50%,1991.5,74870590.0,-66385.5
75%,2007.25,185744700.0,2112.75
max,2023.0,1438070000.0,1197329.0


In [102]:
# Flip the summary so each column becomes a row and each statistic a column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,320.0,1991.5,18.50189,1960.0,1975.75,1991.5,2007.25,2023.0
total_population,320.0,238458600.0,370145900.0,9035043.0,19523535.0,74870587.5,185744700.0,1438070000.0
net_migration,320.0,-222220.9,475187.2,-2290411.0,-322429.5,-66385.5,2112.75,1197329.0


### Display the statistical summary of categorical columns

In [103]:
# Summarise the text (object-dtype) columns: count, unique, top value, frequency
df.describe(include=["object"])

Unnamed: 0,Country
count,320
unique,5
top,Afghanistan
freq,64


### Check the dimensions (shape) of the dataset


In [104]:
# (number of rows, number of columns)
df.shape

(320, 4)

### Check the dataset's column names


In [90]:
# Column labels as a pandas Index
df.columns

Index(['Country', 'Year', 'total_population', 'net_migration'], dtype='object')

In [91]:
# The same column labels, as a plain Python list
df.columns.tolist()

['Country', 'Year', 'total_population', 'net_migration']

### Check the Data types of each column

In [92]:
# dtype currently held by each column
df.dtypes

Country              object
Year                  int64
total_population    float64
net_migration       float64
dtype: object

### Convert the data types of the columns


In [93]:
# Apply both conversions in a single astype() call: df is rebound only if every
# cast succeeds, so a failure cannot leave the frame half-converted.
#   - total_population: float64 -> int64 (possible only because the column has
#     no NaN values; a NaN would make this cast raise)
#   - Country: object -> category (a handful of names repeated many times)
df = df.astype({"total_population": "int64", "Country": "category"})

df.dtypes

Country             category
Year                   int64
total_population       int64
net_migration        float64
dtype: object

### Check for missing values in the dataset


In [94]:
# Number of missing (NaN) entries in each column
df.isna().sum()

Country             0
Year                0
total_population    0
net_migration       0
dtype: int64

# Step: 02 Handling Missing values

### Drop rows with missing values

In [95]:
# The missing-value check found no NaN in any column, so dropna() removes
# nothing here. The call is shown without reassigning df, purely to
# illustrate the step; df itself is left untouched.
df.dropna().head()

Unnamed: 0,Country,Year,total_population,net_migration
0,Afghanistan,2023,41454761,-48958.0
1,Afghanistan,2022,40578842,-647402.0
2,Afghanistan,2021,40000412,-548784.0
3,Afghanistan,2020,39068979,143634.0
4,Afghanistan,2019,37856121,9159.0


### Remove columns with missing values


In [96]:
# Illustration only, intentionally left commented out: this is how a column
# would be removed. Running it would delete "Year" from df in place.
# df.drop(columns = "Year" , inplace = True)

### Drop rows with missing values in specific columns

In [108]:
# Drop only the rows where Country or total_population is missing; NaN in any
# other column does not cause a row to be removed.
# Reassign rather than use inplace=True, which brings no benefit and prevents
# method chaining.
df = df.dropna(subset=["Country", "total_population"])
df.head()

Unnamed: 0,Country,Year,total_population,net_migration
0,Afghanistan,2023,41454761.0,-48958.0
1,Afghanistan,2022,40578842.0,-647402.0
2,Afghanistan,2021,40000412.0,-548784.0
3,Afghanistan,2020,39068979.0,143634.0
4,Afghanistan,2019,37856121.0,9159.0


In [110]:
# Number of rows per country
df["Country"].value_counts()

Country
Afghanistan    64
Bangladesh     64
India          64
Sri Lanka      64
Pakistan       64
Name: count, dtype: int64

# Step: 03 Duplicates and Data Integrity

### Check for duplicates in the dataset

In [115]:
# Count rows that exactly repeat an earlier row (all columns equal)
df.duplicated().sum()

0

### Get the unique values in a column

In [116]:
# Distinct country names, in order of first appearance
df["Country"].unique()

array(['Afghanistan', 'Bangladesh', 'India', 'Sri Lanka', 'Pakistan'],
      dtype=object)

### Count the unique values in a column


In [117]:
# How many distinct countries appear in the dataset
df["Country"].nunique()

5