# Dataset: ApplicantData.csv
# Data Cleaning Process

Importing necessary libraries

In [173]:
import numpy as np
import pandas as pd
import re

Loading Dataset 

In [174]:
df = pd.read_csv("ApplicantData.csv")

Total Rows & Columns

In [175]:
print(f"Total Rows: {df.shape[0]}")
print(f"Total Columns: {df.shape[1]}")

Total Rows: 37882
Total Columns: 4


Checking Datatype

In [176]:
print(df.dtypes)

App_ID          object
Country         object
University      object
Phone_Number    object
dtype: object


Missing Values Checking

In [177]:
print(df.isnull().sum())

App_ID          1
Country         0
University      0
Phone_Number    0
dtype: int64


unique values for each column- consistency checking

In [178]:
# Show unique values for each column
for col in df.columns:
    unique_vals = df[col].unique()
    print(f"Column: {col}")
    print(unique_vals)
    print(f"Total Unique: {len(unique_vals)}\n")


Column: App_ID
['12345' '347397' '358065' ... ',,,,,' '/////////' nan]
Total Unique: 15417

Column: Country
['India' 'Nigeria' 'saarthaksingh05@gmail.com' 'satya.sai1881@gmail.com'
 'sharmaishaan16@gmail.com' 'pillir1026@outlook.com'
 'shalinidec05@gmail.com' 'rameshpriyanka536@gmail.com'
 'samjainsamj16@gmail.com' 'ratneshry06@gmail.com'
 'dhruvvekariya.vmc18@gmail.com' 'kanishk.9871@gmail.com'
 'gayathri.keenala@gmail.com' 'lakshaymunjal17@gmail.com'
 'shah.rahulsailesh@gmail.com' 'sekhar.fall2022@gmail.com'
 'glourdurani@gmail.com' 'polishetty.jyothi08@gmail.com'
 'chsrilatha.eduf23@gmail.com' 'saimukeshjakkula2023@gmail.com'
 'phanendrakatta@gmail.com' 'garv.career@gmail.com'
 'likithapalakolanu08@gmail.com' 'saisandeep.pashem007@gmail.com'
 'nehareddy6721@gmail.com' 'polanisaibalaji@gmail.com'
 'reddyabhinay037@gmail.com' 'ganipineni.vinay999@gmail.com'
 'pasupulatepavankalyan583@gmail.com' 'ganesh.edu2023@gmail.com'
 'gauthamkanyadi@gmail.com' 'riyasubashmendon@gmail.com'
 'ravit

App_ID column

1 missing value & strange entries - removed invalid App_ID

In [179]:
# Convert App_ID to numeric, coercing errors to NaN
df['App_ID'] = pd.to_numeric(df['App_ID'], errors='coerce')

# Remove rows with NaN App_ID (these were invalid/non-numeric)
df = df.dropna(subset=['App_ID'])

# Convert App_ID back to object (string)
df['App_ID'] = df['App_ID'].astype(int).astype(str)

# Check unique App_IDs and datatype
print("Unique App_IDs:", df['App_ID'].unique()[:20])  # show first 20
print("Total unique App_IDs:", df['App_ID'].nunique())
print("Data type of App_ID:", df['App_ID'].dtype)


Unique App_IDs: ['12345' '347397' '358065' '351333' '346435' '355959' '351520' '372165'
 '369273' '365995' '348627' '350814' '357845' '336070' '345428' '352791'
 '363532' '367273' '346775' '355929']
Total unique App_IDs: 15175
Data type of App_ID: object


Country Column

Typos and inconsistent casing fixed

Emails and descriptive text replaced with 'Unknown'

In [180]:
# Replace only email-like entries with 'Unknown'
df['Country'] = df['Country'].apply(lambda x: 'Unknown' if isinstance(x, str) and '@' in x else x)

# Check unique values to verify
print(df['Country'].unique())


['India' 'Nigeria' 'Unknown' 'na' 'naq' '-' 'Taiwan' 'Pakistan'
 'Bangladesh' 'China' 'South Africa'
 'he got admitted into another university'
 'Not attending Illinois Tech, going to a higher ranked university'
 'Afghanistan' 'Azerbaijan' 'Algeria' 'Armenia' 'Bahrain' 'Bolivia'
 'Botswana' 'Brazil' 'Cameroon' 'Burundi'
 'Congo, the Democratic Republic of the' 'Djibouti' 'Egypt' 'Eritrea'
 'Estonia' 'Ethiopia' 'Gambia the' 'Ghana' 'Argentina' 'Belarus' 'Finland'
 'Germany' 'Canada' 'Iran' 'Nepal' 'Myanmar' 'Kenya' 'South Korea'
 'Ecuador' 'France' 'Yemen' 'Zambia' 'Zimbabwe' 'Sri Lanka' 'Rwanda'
 'Saudi Arabia'
 'student wants to change the degree from Doctor of Philosophy to Masters'
 'not able to provide official transcript' 'Indonesia' 'Turkey'
 'Australia' 'Lebanon' 'Kuwait' 'Malawi' 'Brasil' 'Colombia'
 'Ethiopia, Italy' 'Ethiopia , Italy' "Cote d'Ivoire" 'Angola'
 'C�te dIvoire' 'Cyprus' 'Dominica' 'Czech Republic'
 'United Arab Emirates' 'Uganda' 'Sri lanka' 'Spain' 'Sierra Leon

In [181]:
# Mapping of original -> corrected country
country_corrections = {
    # Unknown / invalid entries
    'Unknown': 'Unknown', 'na': 'Unknown', 'naq': 'Unknown', '-': 'Unknown',
    'personal reason': 'Unknown', 'no planes': 'Unknown', 'Not connected': 'Unknown',
    'Finance': 'Unknown', 'he got admitted into another university': 'Unknown',
    'Not attending Illinois Tech, going to a higher ranked university': 'Unknown',
    'student wants to change the degree from Doctor of Philosophy to Masters': 'Unknown',
    'not able to provide official transcript': 'Unknown', 'Onam': 'Unknown', 'onam': 'Unknown',

    # Standardized countries
    'India': 'India', 'india': 'India',
    'Nigeria': 'Nigeria', 'nigeira': 'Nigeria', 'NIgeria': 'Nigeria', 'nigera': 'Nigeria', 'nigeria': 'Nigeria',
    'Pakistan': 'Pakistan', 'Bangladesh': 'Bangladesh', 'China': 'China', 'china': 'China',
    'South Africa': 'South Africa', 'Taiwan': 'Taiwan', 'Taiwan,': 'Taiwan', 'Taiwan, China': 'Taiwan',
    'Afghanistan': 'Afghanistan', 'Azerbaijan': 'Azerbaijan', 'Algeria': 'Algeria', 'Armenia': 'Armenia',
    'Bahrain': 'Bahrain', 'Behrain': 'Bahrain', 'Bolivia': 'Bolivia', 'Botswana': 'Botswana', 'botswana': 'Botswana',
    'Brazil': 'Brazil', 'brasil': 'Brazil', 'Burundi': 'Burundi', 'Cameroon': 'Cameroon', 'cameroon': 'Cameroon',
    'Congo, the Democratic Republic of the': 'Congo', 'Congo': 'Congo', 'Congo Republic': 'Congo', 
    'Congo republic': 'Congo', 'congo republic': 'Congo',
    'Djibouti': 'Djibouti', 'Egypt': 'Egypt', 'egypt': 'Egypt', 'Eritrea': 'Eritrea', 'Estonia': 'Estonia',
    'Ethiopia': 'Ethiopia', 'ethiopia': 'Ethiopia', 'ethiopi': 'Ethiopia',
    'Gambia the': 'Gambia', 'Gambia': 'Gambia', 'gambia': 'Gambia', 'Ghana': 'Ghana', 'ghana': 'Ghana',
    'Argentina': 'Argentina', 'argentina': 'Argentina', 'Belarus': 'Belarus', 'Finland': 'Finland',
    'Germany': 'Germany', 'Canada': 'Canada', 'canada': 'Canada', 'Iran': 'Iran', 'Nepal': 'Nepal',
    'nepal': 'Nepal', 'Nepaal': 'Nepal', 'Myanmar': 'Myanmar', 'Myanmmar': 'Myanmar', 'myanmar': 'Myanmar',
    'Kenya': 'Kenya', 'KEnya': 'Kenya', 'kenya': 'Kenya', 'South Korea': 'South Korea', 'Republic of Korea': 'South Korea',
    'Ecuador': 'Ecuador', 'France': 'France', 'france': 'France', 'Yemen': 'Yemen', 'Zambia': 'Zambia', 'zambia': 'Zambia',
    'Zimbabwe': 'Zimbabwe', 'Sri Lanka': 'Sri Lanka', 'Sri lanka': 'Sri Lanka', 'Srilanka': 'Sri Lanka', 'sri lanka': 'Sri Lanka',
    'Rwanda': 'Rwanda', 'Saudi Arabia': 'Saudi Arabia', 'saudi arabia': 'Saudi Arabia', 'Indonesia': 'Indonesia',
    'Indinesia': 'Indonesia', 'Turkey': 'Turkey', 'Australia': 'Australia', 'australia': 'Australia',
    'Lebanon': 'Lebanon', 'Kuwait': 'Kuwait', 'Malawi': 'Malawi', 'malawi': 'Malawi', 'Cote d\'Ivoire': 'Cote Divoire',
    'Cote d ivoire': 'Cote Divoire', 'C�te dIvoire': 'Cote Divoire', 'Cote D Ivoire': 'Cote Divoire',
    'Angola': 'Angola', 'Cyprus': 'Cyprus', 'Dominica': 'Dominica', 'Czech Republic': 'Czech Republic',
    'United Arab Emirates': 'United Arab Emirates', 'Uganda': 'Uganda', 'Spain': 'Spain', 'spain': 'Spain',
    'Sierra Leone': 'Sierra Leone', 'sierra leone': 'Sierra Leone', 'Somalia': 'Somalia', 'somalia': 'Somalia',
    'Tanzania': 'Tanzania', 'Italy': 'Italy', 'Italia': 'Italy', 'italy': 'Italy', 'Israel': 'Israel',
    'Jordan': 'Jordan', 'Jordon': 'Jordan', 'Kazakhstan': 'Kazakhstan', 'kazakhstan': 'Kazakhstan',
    'Mongolia': 'Mongolia', 'Panama': 'Panama', 'Uzbekistan': 'Uzbekistan', 'ubekistan': 'Uzbekistan',
    'Tajikistan': 'Tajikistan', 'Tazakkistan': 'Tajikistan', 'Syria': 'Syria', 'Sweden': 'Sweden', 'Norway': 'Norway',
    'Swaziland': 'Eswatini', 'Russian Federation': 'Russia', 'Russia': 'Russia', 'RUssia': 'Russia', 'russia': 'Russia',
    'Sudan': 'Sudan', 'sudan': 'Sudan', 'Latvia': 'Latvia', 'Morocco': 'Morocco', 'Mororcco': 'Morocco', 'morocco': 'Morocco',
    'Georgia': 'Georgia', 'Mozambique': 'Mozambique', 'Liberia': 'Liberia', 'liberia': 'Liberia', 'Malaysia': 'Malaysia',
    'malaysia': 'Malaysia', 'Hong Kong': 'Hong Kong', 'hong kong': 'Hong Kong', 'HK': 'Hong Kong', 'hk': 'Hong Kong',
    'Hong Kong, China': 'Hong Kong', 'Vietnam': 'Vietnam', 'Honduras': 'Honduras', 'Iceland': 'Iceland', 'Guyana': 'Guyana',
    'South Sudan': 'South Sudan', 'south sudan': 'South Sudan', 'United Kingdom': 'United Kingdom', 'England': 'United Kingdom',
    'uk': 'United Kingdom', 'United States of America': 'United States of America', 'Usa': 'United States of America',
    'Poland': 'Poland', 'poland': 'Poland', 'Switzerland': 'Switzerland', 'Singapore': 'Singapore', 'singapore': 'Singapore',
    'Philippines': 'Philippines', 'Portugal': 'Portugal', 'Palestine': 'Palestine', 'Netherlands': 'Netherlands', 'Netherland': 'Netherlands',
    'Thailand': 'Thailand', 'Libya': 'Libya', 'libya': 'Libya', 'Iraq': 'Iraq', 'iraq': 'Iraq', 'Senegal': 'Senegal', 'senegal': 'Senegal',
    'Paraguay': 'Paraguay', 'Mexico': 'Mexico', 'mexico': 'Mexico', 'Namibia': 'Namibia', 'Ireland': 'Ireland', 'Lithuania': 'Lithuania',
    'Seychelles': 'Seychelles', 'Cayman Islands': 'Cayman Islands', 'Nicaragua': 'Nicaragua', 'Chile': 'Chile', 'chile': 'Chile',
    'Cambodia': 'Cambodia', 'Albania': 'Albania', 'Jamaica': 'Jamaica', 'jamica': 'Jamaica', 'Gabon': 'Gabon', 'goban': 'Gabon',
    'Madagascar': 'Madagascar', 'Burkina Faso': 'Burkina Faso', 'Cabo Verde': 'Cabo Verde', 'cabo verde': 'Cabo Verde'
}

# Apply corrections
df['Country'] = df['Country'].replace(country_corrections)

# Optional: check
print(df['Country'].unique())

['India' 'Nigeria' 'Unknown' 'Taiwan' 'Pakistan' 'Bangladesh' 'China'
 'South Africa' 'Afghanistan' 'Azerbaijan' 'Algeria' 'Armenia' 'Bahrain'
 'Bolivia' 'Botswana' 'Brazil' 'Cameroon' 'Burundi' 'Congo' 'Djibouti'
 'Egypt' 'Eritrea' 'Estonia' 'Ethiopia' 'Gambia' 'Ghana' 'Argentina'
 'Belarus' 'Finland' 'Germany' 'Canada' 'Iran' 'Nepal' 'Myanmar' 'Kenya'
 'South Korea' 'Ecuador' 'France' 'Yemen' 'Zambia' 'Zimbabwe' 'Sri Lanka'
 'Rwanda' 'Saudi Arabia' 'Indonesia' 'Turkey' 'Australia' 'Lebanon'
 'Kuwait' 'Malawi' 'Brasil' 'Colombia' 'Ethiopia, Italy'
 'Ethiopia , Italy' 'Cote Divoire' 'Angola' 'Cyprus' 'Dominica'
 'Czech Republic' 'United Arab Emirates' 'Uganda' 'Spain' 'Sierra Leone'
 'Somalia' 'Tanzania' 'Italy' 'Israel' 'Jordan' 'Kazakhstan' 'Mongolia'
 'Panama' 'Uzbekistan' 'Tajikistan' 'Syria' 'Sweden' 'Norway' 'Eswatini'
 'Russia' 'Sudan' 'Latvia' 'Morocco' 'Georgia' 'Mozambique' 'Liberia'
 'Malaysia' 'Hong Kong' 'Vietnam' 'Honduras' 'Iceland' 'Guyana' 'Japan'
 'South Sudan' 'Turkm

Typos / lowercase – should be capitalized properly:

In [182]:
# Dictionary to correct typos and lowercase
country_corrections = {
    'Nigeira': 'Nigeria',
    'albania': 'Albania',
    'brazil': 'Brazil',
    'japan': 'Japan',
    'tazakkistan': 'Tajikistan',
    'ukraine': 'Ukraine',
    'columbia': 'Colombia',
    'peru': 'Peru',
    'saudi arabia': 'Saudi Arabia',
    'taiwan': 'Taiwan',
    'Jamaica': 'Jamaica',
    'rwanda': 'Rwanda',
    'cote d ivoire': 'Cote Divoire',
    'uzbekistan': 'Uzbekistan',
    'Benin': 'Benin',
    'oman': 'Oman',
    'armenia': 'Armenia',
    'Gabon': 'Gabon'
}

# Apply corrections
df['Country'] = df['Country'].replace(country_corrections)

# Standardize casing for all entries
df['Country'] = df['Country'].str.title()

# Correct 'Brasil' to 'Brazil'
df['Country'] = df['Country'].replace({'Brasil': 'Brazil'})

# Replace multi-country or invalid entries containing ',' with 'Unknown'
df['Country'] = df['Country'].apply(lambda x: 'Unknown' if isinstance(x, str) and ',' in x else x)

# Check unique values after correction
print(df['Country'].unique())


['India' 'Nigeria' 'Unknown' 'Taiwan' 'Pakistan' 'Bangladesh' 'China'
 'South Africa' 'Afghanistan' 'Azerbaijan' 'Algeria' 'Armenia' 'Bahrain'
 'Bolivia' 'Botswana' 'Brazil' 'Cameroon' 'Burundi' 'Congo' 'Djibouti'
 'Egypt' 'Eritrea' 'Estonia' 'Ethiopia' 'Gambia' 'Ghana' 'Argentina'
 'Belarus' 'Finland' 'Germany' 'Canada' 'Iran' 'Nepal' 'Myanmar' 'Kenya'
 'South Korea' 'Ecuador' 'France' 'Yemen' 'Zambia' 'Zimbabwe' 'Sri Lanka'
 'Rwanda' 'Saudi Arabia' 'Indonesia' 'Turkey' 'Australia' 'Lebanon'
 'Kuwait' 'Malawi' 'Colombia' 'Cote Divoire' 'Angola' 'Cyprus' 'Dominica'
 'Czech Republic' 'United Arab Emirates' 'Uganda' 'Spain' 'Sierra Leone'
 'Somalia' 'Tanzania' 'Italy' 'Israel' 'Jordan' 'Kazakhstan' 'Mongolia'
 'Panama' 'Uzbekistan' 'Tajikistan' 'Syria' 'Sweden' 'Norway' 'Eswatini'
 'Russia' 'Sudan' 'Latvia' 'Morocco' 'Georgia' 'Mozambique' 'Liberia'
 'Malaysia' 'Hong Kong' 'Vietnam' 'Honduras' 'Iceland' 'Guyana' 'Japan'
 'South Sudan' 'Turkmenistan' 'United Kingdom' 'United States Of Ame

University

Only 'Illinois Institute of Technology' → already standardized

Phone Number Column

Numbers like '636989793' will remain if they have ≥7 digits

Numbers like '+234 5467121496' become '+2345467121496'

Invalid or missing numbers become 'Unknown'


In [183]:
def clean_phone_keep_all(number):
    if pd.isna(number):
        return 'Unknown'  # keep NaN as Unknown, optional
    number = str(number).strip()
    
    # Keep leading '+' if present
    if number.startswith('+'):
        number = '+' + re.sub(r'\D', '', number[1:])
    else:
        number = re.sub(r'\D', '', number)
    
    return number

# Apply cleaning
df['Phone_Number'] = df['Phone_Number'].apply(clean_phone_keep_all)

# Check unique values
print(df['Phone_Number'].unique())


['9823241234' '8805617501' '18019011222' ... '917989841011'
 '+2347030851281' '+2345467121496']


In [184]:
# Filter for specific App_IDs
specific_ids = ['398641', '441728']  # make sure these are strings
print(df[df['App_ID'].isin(specific_ids)][['App_ID', 'Phone_Number']])


       App_ID                                       Phone_Number
12249  441728                                       989128914705
13554  441728                                       989128914705
14615  441728                                       989128914705
15998  398641                                                  +
20626  441728  1424182024120413241220242284224420245323191289...


In [185]:
# Replace phone numbers with 'Unknown' for the two specific rows
df.loc[df.index.isin([15998, 20626]), 'Phone_Number'] = 'Unknown'

# Verify the change
print(df.loc[[15998, 20626], ['App_ID', 'Phone_Number']])


       App_ID Phone_Number
15998  398641      Unknown
20626  441728      Unknown


Verifying each column's consistent values and missing values per column

In [186]:
# Show unique values for each column
for col in df.columns:
    unique_vals = df[col].unique()
    print(f"Column: {col}")
    print(unique_vals)
    print(f"Total Unique: {len(unique_vals)}\n")


Column: App_ID
['12345' '347397' '358065' ... '98077675' '9876789' '425358']
Total Unique: 15175

Column: Country
['India' 'Nigeria' 'Unknown' 'Taiwan' 'Pakistan' 'Bangladesh' 'China'
 'South Africa' 'Afghanistan' 'Azerbaijan' 'Algeria' 'Armenia' 'Bahrain'
 'Bolivia' 'Botswana' 'Brazil' 'Cameroon' 'Burundi' 'Congo' 'Djibouti'
 'Egypt' 'Eritrea' 'Estonia' 'Ethiopia' 'Gambia' 'Ghana' 'Argentina'
 'Belarus' 'Finland' 'Germany' 'Canada' 'Iran' 'Nepal' 'Myanmar' 'Kenya'
 'South Korea' 'Ecuador' 'France' 'Yemen' 'Zambia' 'Zimbabwe' 'Sri Lanka'
 'Rwanda' 'Saudi Arabia' 'Indonesia' 'Turkey' 'Australia' 'Lebanon'
 'Kuwait' 'Malawi' 'Colombia' 'Cote Divoire' 'Angola' 'Cyprus' 'Dominica'
 'Czech Republic' 'United Arab Emirates' 'Uganda' 'Spain' 'Sierra Leone'
 'Somalia' 'Tanzania' 'Italy' 'Israel' 'Jordan' 'Kazakhstan' 'Mongolia'
 'Panama' 'Uzbekistan' 'Tajikistan' 'Syria' 'Sweden' 'Norway' 'Eswatini'
 'Russia' 'Sudan' 'Latvia' 'Morocco' 'Georgia' 'Mozambique' 'Liberia'
 'Malaysia' 'Hong Kong' 'V

Duplicate Rows Detecting

In [187]:
# Check duplicate rows
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows:", duplicate_rows.shape[0])

Number of duplicate rows: 16387


Removing Duplicate Rows

In [188]:
# Keep the first occurrence, drop the rest
df = df.drop_duplicates(keep='first')

# Check the shape after removal
print("Dataset shape after removing duplicates:", df.shape)


Dataset shape after removing duplicates: (16836, 4)


Verifying Dataset Overall 

In [189]:
# Summary verification function
def verify_dataset(df):
    print("=== Dataset Shape ===")
    print(df.shape, "\n")
    
    print("=== Missing Values ===")
    print(df.isna().sum(), "\n")
    
    print("=== Duplicate Rows ===")
    dup_count = df.duplicated().sum()
    print(f"Number of duplicate rows: {dup_count}\n")
    
    print("=== Column Datatypes ===")
    print(df.dtypes, "\n")
    
    print("=== Sample of Dataset ===")
    print(df.head())

# Run verification
verify_dataset(df)


=== Dataset Shape ===
(16836, 4) 

=== Missing Values ===
App_ID          0
Country         0
University      0
Phone_Number    0
dtype: int64 

=== Duplicate Rows ===
Number of duplicate rows: 0

=== Column Datatypes ===
App_ID          object
Country         object
University      object
Phone_Number    object
dtype: object 

=== Sample of Dataset ===
   App_ID  Country                        University  Phone_Number
0   12345    India  Illinois Institute of Technology    9823241234
1   12345    India  Illinois Institute of Technology    8805617501
2   12345    India  Illinois Institute of Technology   18019011222
3  347397  Nigeria  Illinois Institute of Technology    7738599513
4  347397  Nigeria  Illinois Institute of Technology  919182706838


Final verification:

Checked missing values → handled

Checked duplicates → removed

Verified datatypes → correct

Ensured Consistency + Standardization

Final dataset structure: 16836 unique rows, 4 columns.

Exporting Cleaned Dataset

In [190]:
df.to_csv("Cleaned_ApplicantData.csv", index=False)

print("Dataset successfully exported as 'Cleaned_ApplicantData.csv'")

Dataset successfully exported as 'Cleaned_ApplicantData.csv'


# Finally "ApplicantData.csv" Dataset is Cleaned !