In [1]:
import pandas as pd
import re

Overall Purpose
- Clean and standardize raw student data for analysis.
- Ensure numeric columns are correct and usable.
- Remove duplicates and irrelevant records.
- Prepare a clean dataset for further analysis like ranking, statistics, or visualization.

In [None]:
df = pd.read_csv("Private_data.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,NAME OF THE STUDENT,UNIVERSITY,PROGRAM NAME,Specialisation,SEMESTER,Domain,GENERAL MANAGEMENT SCORE (OUT of 50),Domain Specific SCORE (OUT 50),TOTAL SCORE (OUT of 100),RANK,PERCENTILE
0,0,Camila Wood,"Stanford University, USA",B.Com,Honours,5th,Finance,50,50,100,1.0,1.0
1,1,Alexander Thompson,"Stanford University, USA",B.Com,Financial Services,5th,Finance,50,50,100,2.0,0.993377
2,2,Liam Taylor,"Harvard University, USA",B.Com,Accounting Analytics,5th,BA,50,50,100,3.0,0.986755
3,3,Evelyn Jenkins,"Stanford University, USA",B.Com,Honours,5th,Finance,49,50,99,4.0,0.980132
4,4,Michael Jackson,"Harvard University, USA",MBA,International Business,3rd,IB,50,49,99,5.0,0.97351
5,5,Chloe Moore,"Stanford University, USA",B.Com,Honours,5th,Finance,50,48,98,6.0,0.966887
6,6,Nicholas Clark,"Harvard University, USA",MBA,International Business,3rd,IB,49,49,98,7.0,0.960265
7,7,Olivia Richardson,"Stanford University, USA",BBA,International Business,5th,IB,50,47,97,8.0,0.953642
8,8,Aiden Rivera,"Stanford University, USA",B.Com,Honours,5th,Finance,50,47,97,9.0,0.94702
9,9,Harper Turner,"Harvard University, USA",MBA,International Business,3rd,IB,49,48,97,10.0,0.940397


In [None]:
file = "Private_data"
clean_tb_name = file.lower().replace(" ", "_").replace("?", "")\
.replace(r"/", "").replace("\\", "").replace("-", "_")\
.replace(")", "").replace("(", "").replace("%", "").replace(r",", "")

Converts the file name into a safe, standardized format (all lowercase, underscores instead of spaces, removes special characters).

Prevents issues with saving/loading files on different systems or using them as variable names.

In [None]:

clean_tb_name

'private_data'

In [None]:
df.columns = [x.lower().replace(" ", "_").replace("?", "")\
.replace(r"/", "").replace("\\", "").replace("-", "_")\
.replace("%", "").replace(r",", "") for x in df.columns]

Standardizes column names (lowercase, underscores, no special characters).

Makes it easier to reference columns in code consistently, avoiding errors due to spaces or special characters.

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   unnamed:_0                            151 non-null    int64  
 1   name_of_the_student                   151 non-null    object 
 2   university                            151 non-null    object 
 3   program_name                          151 non-null    object 
 4   specialisation                        151 non-null    object 
 5   semester                              151 non-null    object 
 6   domain                                151 non-null    object 
 7   general_management_score_(out_of_50)  151 non-null    int64  
 8   domain_specific_score_(out_50)        150 non-null    object 
 9   total_score_(out_of_100)              151 non-null    object 
 10  rank                                  151 non-null    float64
 11  percentile         

Checks the types of each column and non-null counts.

Helps identify columns that need type conversion (e.g., numeric vs string) and missing data.

In [None]:
non_numeric_rows = df[~df['total_score_(out_of_100)'].str.replace('.', '', regex=False).str.isnumeric()]

print(non_numeric_rows)

     unnamed:_0 name_of_the_student                university program_name  \
77           77         Sophia King  Stanford University, USA          MBA   
86           86      Harper Jackson   Harvard University, USA          MBA   
139         139        Olivia White   Harvard University, USA        B.Com   

             specialisation semester domain  \
77        Digital Marketing      3rd     DM   
86   International Business      3rd     IB   
139      Business Analytics      5th     BA   

     general_management_score_(out_of_50) domain_specific_score_(out_50)  \
77                                     38                             36   
86                                     50                             22   
139                                    21                              9   

    total_score_(out_of_100)   rank  percentile  
77                         x   78.0    0.490066  
86                        -4   87.0    0.430464  
139                      aaa  140.0    0.07

Identifies rows where numeric columns (total_score_(out_of_100)) have invalid entries like x, -4, aaa.

Ensures data consistency before numeric operations; prevents errors in calculations or analysis.

In [None]:
df['domain_specific_score_(out_50)'] = pd.to_numeric(df['domain_specific_score_(out_50)'], errors='coerce')
df['total_score_(out_of_100)'] = pd.to_numeric(df['total_score_(out_of_100)'], errors='coerce')
df['percentile'] = pd.to_numeric(df['percentile'], errors='coerce')


Forces the columns to numeric type; non-numeric entries become NaN.

Allows arithmetic and statistical operations on scores, but introduces NaN where data was invalid.

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   unnamed:_0                            151 non-null    int64  
 1   name_of_the_student                   151 non-null    object 
 2   university                            151 non-null    object 
 3   program_name                          151 non-null    object 
 4   specialisation                        151 non-null    object 
 5   semester                              151 non-null    object 
 6   domain                                151 non-null    object 
 7   general_management_score_(out_of_50)  151 non-null    int64  
 8   domain_specific_score_(out_50)        149 non-null    float64
 9   total_score_(out_of_100)              149 non-null    float64
 10  rank                                  151 non-null    float64
 11  percentile         

Видаляю лічильник, так як за замовченням є нумерація

In [None]:
df = df.drop(columns=['unnamed:_0'])

In [None]:
duplic_rows_by_name = df.duplicated(subset='name_of_the_student')
print(df[duplic_rows_by_name])

    name_of_the_student                university   program_name  \
34         Emma Stewart  Stanford University, USA            BBA   
64          Carter Hall   Harvard University, USA  Intg. BBA+MBA   
73           Aria Evans  Stanford University, USA            BBA   
77          Sophia King  Stanford University, USA            MBA   
109         Liam Taylor   Harvard University, USA            MBA   
125          Grace Hall   Harvard University, USA            MBA   
127          Lily Davis  Stanford University, USA            BBA   
137     Scarlett Cooper  University of Oxford, UK  Intg. BBA+MBA   
138           Mia Young  Stanford University, USA            BBA   
150      Abigail Bailey   Harvard University, USA            MBA   

                                        specialisation semester   domain  \
34                                  Financial Services      5th  Finance   
64                              International Business      3rd       IB   
73                     

Ensures each student appears only once.

Prevents double-counting in analysis and statistics.

In [None]:
df = df.drop_duplicates()

Видаляю усі рядки, що мають не той формат

In [None]:
df = df.dropna()

Видалаяю рядки, де програма НЕ BBA або MBA, як вказано у інструкції до практичної

In [None]:
df = df[df['program_name'].apply(lambda x: x in ['BBA', 'MBA'])]

In [None]:
df = df.reset_index(drop=True)
df.index += 1

In [None]:
df.head(20)

Unnamed: 0,name_of_the_student,university,program_name,specialisation,semester,domain,general_management_score_(out_of_50),domain_specific_score_(out_50),total_score_(out_of_100),rank,percentile
1,Michael Jackson,"Harvard University, USA",MBA,International Business,3rd,IB,50,49.0,99.0,5.0,0.97351
2,Nicholas Clark,"Harvard University, USA",MBA,International Business,3rd,IB,49,49.0,98.0,7.0,0.960265
3,Olivia Richardson,"Stanford University, USA",BBA,International Business,5th,IB,50,47.0,97.0,8.0,0.953642
4,Harper Turner,"Harvard University, USA",MBA,International Business,3rd,IB,49,48.0,97.0,10.0,0.940397
5,Madison Lee,"Stanford University, USA",BBA,Business Analytics,5th,BA,47,50.0,197.0,11.0,0.933775
6,Carter Hall,"Harvard University, USA",MBA,International Business,3rd,IB,50,47.0,97.0,12.0,0.927152
7,Abigail Bailey,"Harvard University, USA",MBA,International Business,3rd,IB,47,48.0,95.0,15.0,0.907285
8,Joseph Diaz,"Harvard University, USA",MBA,International Business,3rd,IB,49,46.0,95.0,16.0,0.900662
9,William Walker,"University of Cambridge, UK",MBA,Information Technology with Accounting Analytics,5th,BA,48,47.0,95.0,17.0,0.89404
10,Avery Gonzalez,"Harvard University, USA",MBA,Innovation Entrepreneurship and Venture Develo...,3rd,E&I,49,43.0,92.0,20.0,0.874172


Перевіряємо кількість рядків

такі результати були на початку

RangeIndex: 151 entries, 0 to 150

Data columns (total 12 columns):

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 1 to 97
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   name_of_the_student                   97 non-null     object 
 1   university                            97 non-null     object 
 2   program_name                          97 non-null     object 
 3   specialisation                        97 non-null     object 
 4   semester                              97 non-null     object 
 5   domain                                97 non-null     object 
 6   general_management_score_(out_of_50)  97 non-null     int64  
 7   domain_specific_score_(out_50)        97 non-null     float64
 8   total_score_(out_of_100)              97 non-null     float64
 9   rank                                  97 non-null     float64
 10  percentile                            97 non-null     float64
dtypes: float64(4), int64(

In [None]:
df.to_csv('Private_data1.csv', index=False)