In [10]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Load the dataset 
df = pd.read_csv('C:/Users/Admin/Desktop/Final Exam prep/urbanandruralpopulation new.csv')

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
       country Code country  Year  Urban population Rural population
0  Afghanistan          AFG  1960            724373          7898093
1  Afghanistan          AFG  1961            763336          8026804
2  Afghanistan          AFG  1962            805062          8163985
3  Afghanistan          AFG  1963            849446          8308019
4  Afghanistan          AFG  1964            896820          8458694


In [12]:
# Checking the shape of the dataset (size)
print("\nDataset size (rows, columns):")
print(df.shape)



Dataset size (rows, columns):
(14168, 5)


In [13]:
# Display columns and their data types
print("\nColumns and data types:")
print(df.dtypes)


Columns and data types:
country             object
Code country        object
Year                 int64
Urban population     int64
Rural population    object
dtype: object


In [14]:
# Get basic statistics for numerical columns
print("\nSummary statistics for numerical columns:")
print(df.describe())


Summary statistics for numerical columns:
               Year  Urban population
count  14168.000000      1.416800e+04
mean    1990.533879      5.097131e+07
std       17.895499      2.442102e+08
min     1960.000000      8.590000e+02
25%     1975.000000      2.385732e+05
50%     1991.000000      2.029394e+06
75%     2006.000000      8.821502e+06
max     2021.000000      4.454153e+09


In [15]:
# Checking for missing data
print("\nChecking for missing data:")
print(df.isnull().sum())


Checking for missing data:
country             0
Code country        0
Year                0
Urban population    0
Rural population    0
dtype: int64


In [17]:
# Slice the dataset - Display rows 0 to 5 and columns 'Year', 'Urban population', 'Rural population'
print("\nSlicing data (rows 0-5, columns 'Year', 'Urban population', 'Rural population'):")
print(df.loc[0:5, ['Year', 'Urban population', 'Rural population']])


Slicing data (rows 0-5, columns 'Year', 'Urban population', 'Rural population'):
   Year  Urban population Rural population
0  1960            724373          7898093
1  1961            763336          8026804
2  1962            805062          8163985
3  1963            849446          8308019
4  1964            896820          8458694
5  1965            947332          8617815


In [20]:
# Data Cleaning and Preprocessing

# Handling missing data - Filling missing values with mean for numerical columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns  # Select numeric columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# For non-numeric columns, you can fill missing values differently, e.g., with mode (most frequent value)
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)  # Fill missing values with the most frequent value (mode)

# Now continue with your normalization step
# Normalizing 'Urban population' and 'Rural population' columns if they exist and are numeric
if 'Urban population' in numeric_cols and 'Rural population' in numeric_cols:
    scaler = StandardScaler()
    df[['Urban population', 'Rural population']] = scaler.fit_transform(df[['Urban population', 'Rural population']])

# Checking if the data is now clean
print("\nData after cleaning and normalization (first 5 rows):")
print(df)



Data after cleaning and normalization (first 5 rows):
           country Code country  Year  Urban population Rural population
0      Afghanistan          AFG  1960            724373          7898093
1      Afghanistan          AFG  1961            763336          8026804
2      Afghanistan          AFG  1962            805062          8163985
3      Afghanistan          AFG  1963            849446          8308019
4      Afghanistan          AFG  1964            896820          8458694
...            ...          ...   ...               ...              ...
14163     Zimbabwe          ZWE  2017           4755312          9995789
14164     Zimbabwe          ZWE  2018           4848158         10204026
14165     Zimbabwe          ZWE  2019           4945719         10408889
14166     Zimbabwe          ZWE  2020           5052214         10617452
14167     Zimbabwe          ZWE  2021           5166388         10827136

[14168 rows x 5 columns]


In [21]:
# Saving the cleaned dataset
df.to_csv('cleaned_dataset.csv', index=False)