In [1]:
# 1) Import all the required Python Libraries
import pandas as pd
import numpy as np

In [2]:
# 2) Description of the data:
# Dataset Name: Students Data
# Description: Contains details like Name, Age, Gender, Marks, and City for students.
# Source: Custom-made CSV file created for this practical exercise

In [3]:
# 3) Read the student records from the CSV file into a pandas DataFrame
students_csv = 'Students.csv'
df = pd.read_csv(students_csv)

In [4]:
# Display the freshly loaded DataFrame to inspect its rows and columns
df

Unnamed: 0,Name,Age,Gender,Marks,City
0,John,22.0,Male,85,Pune
1,Sara,21.0,Female,90,Mumbai
2,Alex,23.0,Male,78,Delhi
3,Mary,22.0,Female,88,Bangalore
4,Tom,,Male,76,Chennai
5,Rose,24.0,Female,95,Mumbai


In [5]:
# 4) Data Preprocessing
# Count the missing (NaN) entries in every column
print("Missing Values:")
missing_per_column = df.isna().sum()
missing_per_column

Missing Values:


Name      0
Age       1
Gender    0
Marks     0
City      0
dtype: int64

In [6]:
# Summarise all columns, numeric and non-numeric alike, in a single table
print("Statistical Summary:")
summary_stats = df.describe(include='all')
summary_stats

Statistical Summary:


Unnamed: 0,Name,Age,Gender,Marks,City
count,6,5.0,6,6.0,6
unique,6,,2,,5
top,John,,Male,,Mumbai
freq,1,,3,,2
mean,,22.4,,85.333333,
std,,1.140175,,7.25718,
min,,21.0,,76.0,
25%,,22.0,,79.75,
50%,,22.0,,86.5,
75%,,23.0,,89.5,


In [7]:
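# Variable descriptions:
# Name: Character (text, stored as object dtype)
# Age: Numeric (float64, because the column contains a missing value)
# Gender: Character (text, stored as object dtype)
# Marks: Numeric (int64)
# City: Character (text, stored as object dtype)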

In [8]:
# Report the (rows, columns) shape of the frame and its number of axes
frame_shape = df.shape
axis_count = df.ndim
print("Dimensions of DataFrame:", frame_shape)
print("Number of Dimensions:", axis_count)

Dimensions of DataFrame: (6, 5)
Number of Dimensions: 2


In [9]:
# 5) Data Formatting and Normalization
# Inspect each column's dtype before any conversion is applied
dtypes_before = df.dtypes
print("\nData Types Before Conversion:\n", dtypes_before)


Data Types Before Conversion:
 Name       object
Age       float64
Gender     object
Marks       int64
City       object
dtype: object


In [10]:
# Impute the missing ages with the mean of the observed ages
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
df

Unnamed: 0,Name,Age,Gender,Marks,City
0,John,22.0,Male,85,Pune
1,Sara,21.0,Female,90,Mumbai
2,Alex,23.0,Male,78,Delhi
3,Mary,22.0,Female,88,Bangalore
4,Tom,22.4,Male,76,Chennai
5,Rose,24.0,Female,95,Mumbai


In [11]:
# Convert 'Age' to whole years. Round first: astype(int) on its own truncates
# toward zero, so a fractional imputed mean such as 22.6 would become 22
# instead of the nearest whole year, 23.
df['Age'] = df['Age'].round().astype(int)

In [12]:
# Re-inspect the column dtypes now that 'Age' has been cast
dtypes_after = df.dtypes
print("\nData Types After Conversion:\n", dtypes_after)


Data Types After Conversion:
 Name      object
Age        int64
Gender    object
Marks      int64
City      object
dtype: object


In [13]:
# 6) Turn categorical variables into quantitative variables
# One-hot encode 'Gender' and 'City'; each indicator column is named
# "<source column>_<category>" because the prefix equals the column name
gender_dummies, city_dummies = (
    pd.get_dummies(df[column], prefix=column) for column in ('Gender', 'City')
)

In [14]:
# Show the indicator columns produced for 'Gender'
gender_dummies

Unnamed: 0,Gender_Female,Gender_Male
0,False,True
1,True,False
2,False,True
3,True,False
4,False,True
5,True,False


In [15]:
# Show the indicator columns produced for 'City'
city_dummies

Unnamed: 0,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune
0,False,False,False,False,True
1,False,False,False,True,False
2,False,False,True,False,False
3,True,False,False,False,False
4,False,True,False,False,False
5,False,False,False,True,False


In [16]:
# Attach the dummy columns to the main frame side by side, then discard the
# original 'Gender' and 'City' text columns that the dummies now represent
df_final = pd.concat([df, gender_dummies, city_dummies], axis=1).drop(
    columns=['Gender', 'City']
)

# Final DataFrame
print("\nFinal DataFrame after Converting Categorical Variables:\n")
df_final


Final DataFrame after Converting Categorical Variables:



Unnamed: 0,Name,Age,Marks,Gender_Female,Gender_Male,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune
0,John,22,85,False,True,False,False,False,False,True
1,Sara,21,90,True,False,False,False,False,True,False
2,Alex,23,78,False,True,False,False,True,False,False
3,Mary,22,88,True,False,True,False,False,False,False
4,Tom,22,76,False,True,False,True,False,False,False
5,Rose,24,95,True,False,False,False,False,True,False


In [17]:
# Swap every boolean False in the frame for the integer 0, modifying
# df_final in place (True values are left untouched by this step)
df_final.replace(to_replace=False, value=0, inplace=True)
df_final

Unnamed: 0,Name,Age,Marks,Gender_Female,Gender_Male,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune
0,John,22,85,0,True,0,0,0,0,True
1,Sara,21,90,True,0,0,0,0,True,0
2,Alex,23,78,0,True,0,0,True,0,0
3,Mary,22,88,True,0,True,0,0,0,0
4,Tom,22,76,0,True,0,True,0,0,0
5,Rose,24,95,True,0,0,0,0,True,0


In [18]:
# Finish the 0/1 encoding by casting the dummy columns to int, which turns the
# remaining True values into 1. This replaces a whole-frame replace(True, 1):
# that call depends on pandas implicitly downcasting object columns (behaviour
# pandas has deprecated) and touches columns that are not dummies.
# astype(int) gives the same 0/1 result whether the dummy columns are bool,
# mixed 0/True objects, or already integers.
dummy_columns = [
    column for column in df_final.columns
    if column.startswith(('Gender_', 'City_'))
]
df_final = df_final.astype({column: int for column in dummy_columns})
df_final

  df_final.replace(True,1,inplace=True)


Unnamed: 0,Name,Age,Marks,Gender_Female,Gender_Male,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune
0,John,22,85,0,1,0,0,0,0,1
1,Sara,21,90,1,0,0,0,0,1,0
2,Alex,23,78,0,1,0,0,1,0,0
3,Mary,22,88,1,0,1,0,0,0,0
4,Tom,22,76,0,1,0,1,0,0,0
5,Rose,24,95,1,0,0,0,0,1,0


In [19]:
# Show the fully numeric, one-hot-encoded DataFrame
df_final

Unnamed: 0,Name,Age,Marks,Gender_Female,Gender_Male,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune
0,John,22,85,0,1,0,0,0,0,1
1,Sara,21,90,1,0,0,0,0,1,0
2,Alex,23,78,0,1,0,0,1,0,0
3,Mary,22,88,1,0,1,0,0,0,0
4,Tom,22,76,0,1,0,1,0,0,0
5,Rose,24,95,1,0,0,0,0,1,0
