Dataset Validation

In [24]:
import numpy as np
import pandas as pd

In [25]:
# Load the opportunity-wise dataset from a CSV file; the relative path is
# resolved against the notebook's current working directory.
df = pd.read_csv("Cleaned_SLU_Opportunity_Wise_Data.csv")

In [26]:
# The ten institution names that occur most often in the data
institution_counts = df["Institution Name"].value_counts()
top_institutions = institution_counts.head(10)
print(top_institutions)


Institution Name
Saint Louis University                                3487
Not Applicable                                         777
Illinois Institute Of Technology                       151
Webster University                                      61
Kwame Nkrumah University Of Science And Technology      47
Srm University                                          31
University Of Ibadan                                    27
Saint Louis Univeristy                                  25
Vishnu Institute Of Technology                          25
Chandigarh University                                   24
Name: count, dtype: int64


In [27]:
# Map the misspelled institution name onto its correct spelling
institution_fixes = {"Saint Louis Univeristy": "Saint Louis University"}
df["Institution Name"] = df["Institution Name"].replace(institution_fixes)

# Print the ten most frequent institutions again to confirm the fix
top_institutions = df["Institution Name"].value_counts().head(10)
print(top_institutions)


Institution Name
Saint Louis University                                3512
Not Applicable                                         777
Illinois Institute Of Technology                       151
Webster University                                      61
Kwame Nkrumah University Of Science And Technology      47
Srm University                                          31
University Of Ibadan                                    27
Vishnu Institute Of Technology                          25
Vellore Institute Of Technology                         24
Chandigarh University                                   24
Name: count, dtype: int64


Final check of the dataset

In [28]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8246 entries, 0 to 8245
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Learner SignUp DateTime  8246 non-null   object
 1   Opportunity Id           8246 non-null   object
 2   Opportunity Name         8246 non-null   object
 3   Opportunity Category     8246 non-null   object
 4   Opportunity End Date     8246 non-null   object
 5   First Name               8246 non-null   object
 6   Date of Birth            8246 non-null   object
 7   Gender                   8246 non-null   object
 8   Country                  8246 non-null   object
 9   Institution Name         8246 non-null   object
 10  Current/Intended Major   8246 non-null   object
 11  Entry created at         8246 non-null   object
 12  Status Description       8246 non-null   object
 13  Status Code              8246 non-null   int64 
 14  Apply Date               8246 non-null  

In [29]:
# Count the missing values in each column.
df.isnull().sum()

Learner SignUp DateTime    0
Opportunity Id             0
Opportunity Name           0
Opportunity Category       0
Opportunity End Date       0
First Name                 0
Date of Birth              0
Gender                     0
Country                    0
Institution Name           0
Current/Intended Major     0
Entry created at           0
Status Description         0
Status Code                0
Apply Date                 0
Opportunity Start Date     0
dtype: int64

In [30]:
# Count the rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [31]:
# Report the dimensions of the frame
n_rows, n_cols = df.shape
print("Total Rows: ", n_rows)
print("Total columns: ", n_cols)

Total Rows:  8246
Total columns:  16


In [32]:
# Columns that hold dates or timestamps
date_cols = [
    "Learner SignUp DateTime",
    "Opportunity End Date",
    "Date of Birth",
    "Entry created at",
    "Apply Date",
    "Opportunity Start Date",
]

# Parse each date column; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce')

# Text columns to store with the pandas 'category' dtype
cat_cols = [
    "Opportunity Category",
    "Gender",
    "Country",
    "Institution Name",
    "Current/Intended Major",
    "Status Description",
]

# astype on a frame converts every selected column individually
df[cat_cols] = df[cat_cols].astype('category')

# Show the resulting dtype of every column
print(df.dtypes)


Learner SignUp DateTime    datetime64[ns]
Opportunity Id                     object
Opportunity Name                   object
Opportunity Category             category
Opportunity End Date       datetime64[ns]
First Name                         object
Date of Birth              datetime64[ns]
Gender                           category
Country                          category
Institution Name                 category
Current/Intended Major           category
Entry created at           datetime64[ns]
Status Description               category
Status Code                         int64
Apply Date                 datetime64[ns]
Opportunity Start Date     datetime64[ns]
dtype: object


In [33]:
# Re-count missing values after the dtype conversion: to_datetime was called
# with errors='coerce', so any date that failed to parse is now NaT and would
# be counted as a null here.
df.isnull().sum()

Learner SignUp DateTime    0
Opportunity Id             0
Opportunity Name           0
Opportunity Category       0
Opportunity End Date       0
First Name                 0
Date of Birth              0
Gender                     0
Country                    0
Institution Name           0
Current/Intended Major     0
Entry created at           0
Status Description         0
Status Code                0
Apply Date                 0
Opportunity Start Date     0
dtype: int64

In [34]:
# Re-inspect the frame to confirm the datetime64 and category dtypes took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8246 entries, 0 to 8245
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Learner SignUp DateTime  8246 non-null   datetime64[ns]
 1   Opportunity Id           8246 non-null   object        
 2   Opportunity Name         8246 non-null   object        
 3   Opportunity Category     8246 non-null   category      
 4   Opportunity End Date     8246 non-null   datetime64[ns]
 5   First Name               8246 non-null   object        
 6   Date of Birth            8246 non-null   datetime64[ns]
 7   Gender                   8246 non-null   category      
 8   Country                  8246 non-null   category      
 9   Institution Name         8246 non-null   category      
 10  Current/Intended Major   8246 non-null   category      
 11  Entry created at         8246 non-null   datetime64[ns]
 12  Status Description       8246 non-

Exporting Cleaned Dataset

In [35]:
# Write the frame to a new CSV file (a different filename from the one loaded
# above); index=False leaves the row index out of the file.
# NOTE(review): CSV is plain text, so the datetime64/category dtypes set above
# presumably have to be re-applied when this file is read back — confirm.
df.to_csv("Cleaned_SLU_Opportunity_Wise_Dataset.csv", index=False)