In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay, roc_curve, precision_score, f1_score, recall_score
from sklearn.svm import SVC
import re

## First, Let's Understand the Data

In this section, we will explore the dataset to gain insights into its structure, features, and content. Understanding the data is crucial for effective analysis and interpretation. We will examine aspects such as:

Data Types: Identifying the types of each feature (e.g., numerical, categorical).

Missing Values: Checking for any missing or null entries that may affect our analysis.

Basic Statistics: Calculating summary statistics (mean, median, mode) to understand the distribution of the data.

In [None]:
traindf = pd.read_csv('train.csv')

In [None]:
traindf.shape

(6548, 24)

In [None]:
traindf.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speaciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,ذكر,منطقة الرياض,الرياض,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,,,...,نعم,البكالوريوس,هندسة حاسب الالي,,2.44,4.0,غير موظف,,,0
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,ذكر,منطقة عسير,خميس مشيط,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,,متوسط,...,نعم,البكالوريوس,الإذاعة والتلفزيون والفيلم,الفنون والعلوم الإنسانية,5.0,5.0,طالب,,,0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,ذكر,منطقة الرياض,الرياض,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,,,متوسط,...,نعم,البكالوريوس,Information Technology,,3.5,5.0,موظف,,,0
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,ذكر,منطقة الرياض,الرياض,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,,,...,نعم,البكالوريوس,حوسبة تطبيقية - (مسار شبكات الحاسب),,3.55,5.0,خريج,,,0
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,ذكر,منطقة الرياض,الرياض,d32da0e9-1aed-48c3-992d-a22f9ccc741e,CAUF,SWPS,تقليدية,متوسط,...,لا,البكالوريوس,نظم المعلومات الحاسوبية,تكنولوجيا الاتصالات والمعلومات,4.0,5.0,,,,0


In [None]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6548 entries, 0 to 6547
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Student ID                      6548 non-null   object 
 1   Age                             6456 non-null   float64
 2   Gender                          6548 non-null   object 
 3   Home Region                     6546 non-null   object 
 4   Home City                       6546 non-null   object 
 5   Program ID                      6548 non-null   object 
 6   Program Main Category Code      6548 non-null   object 
 7   Program Sub Category Code       5613 non-null   object 
 8   Technology Type                 3566 non-null   object 
 9   Program Skill Level             4902 non-null   object 
 10  Program Presentation Method     6548 non-null   object 
 11  Program Start Date              6548 non-null   object 
 12  Program End Date                65

##Converting Age from Decimal to Integer

In this part of the code, we will convert the age values from decimal format to natural numbers (integers).

In [None]:
traindf['Age'] = traindf['Age'].astype('Int64')

##Correcting Column Names with Spelling Errors

In this section, we will address any spelling errors in the column names of the dataset. Accurate column names are essential for clear data interpretation and analysis.

In [None]:
traindf.rename(columns={'Education Speaciality': 'Education Speciality'}, inplace=True)

## Identifying Unique Values for Non-Sequential Columns

In this section, we will identify the unique values for columns that do not contain sequential identifiers. We will exclude specific columns from this analysis.

Excluded Columns:

-Student ID

-Age

-Program ID

-Program Start Date

-Program End Date

-University Degree Score

In [None]:
excluded_columns = ['Student ID', 'Age', 'Program ID', 'Program Start Date', 'Program End Date', 'University Degree Score']

for column in traindf.columns:
    if column not in excluded_columns:
        unique_values = traindf[column].unique()
        print(f"Unique values for {column}: {unique_values}\n")

Unique values for Gender: ['ذكر' 'أنثى']

Unique values for Home Region: ['منطقة الرياض' 'منطقة عسير' 'المنطقة الشرقية' 'منطقة مكة المكرمة'
 'منطقة نجران' 'منطقة المدينة المنورة' 'منطقة القصيم' 'منطقة تبوك'
 'منطقة الحدود الشمالية' 'منطقة جازان' 'منطقة الباحة' 'منطقة حائل'
 'منطقة الجوف' nan]

Unique values for Home City: ['الرياض' 'خميس مشيط' 'حفر الباطن' 'جدة' 'نجران' 'مكة المكرمة' 'ينبع'
 'المدينة المنورة' 'الهفوف' 'بريدة' 'تبوك' 'العاصمة المقدسة' 'الأحساء'
 'الجبيل' 'عرعر' 'أبها' 'أبو عريش' 'الدمام' 'الطائف' 'الباحة'
 'أحد المسارحة' 'حائل' 'الدلم' 'القطيف' 'الدرعية' 'جيزان' 'جازان' 'الخبر'
 'المجمعة' 'الخرج' 'الظهران' 'الدوادمي' 'المزاحمية' 'عنيزة' 'القنفذة'
 'الليث' 'شرورة' 'الرس' 'صامطة' 'شقراء' 'سيهات' 'صفوى' 'سكاكا' 'البدائع'
 'رابغ' 'رفحاء' 'بيشة' 'محايل' 'المذنب' 'الخفجي' 'رأس تنورة' 'ضمد'
 'القريات' nan 'حوطة بني تميم' 'ضبا' 'مهد الذهب' 'بارق' 'بيش'
 'دومة الجندل' 'صبيا' 'العرضيات' 'الحناكية' 'سراة عبيدة' 'أحد رفيدة'
 'تربة' 'العلا' 'الحريملاء' 'بلجرشي' 'بقيق' 'القرى' 'رجال 

##Identifying Duplicate Rows

In this section, we will identify and extract the duplicate rows from the dataset. This is important for ensuring data quality and integrity.

In [None]:
duplicate_ids = traindf[traindf.duplicated()]
duplicate_ids

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
1145,cc92fa1e-494d-406f-8faf-9019d15ed5d0,37.0,ذكر,منطقة الرياض,الرياض,1c8a5fbd-9986-486f-9d9b-b50568f2589b,PCRF,PCRF,,متقدم,...,نعم,البكالوريوس,تقنية معلومات وحاسوب,,2.23,4.0,موظف,,,0
1801,7ab3d573-5a3e-4b13-bffe-9d0d4c2d5ef9,25.0,أنثى,المنطقة الشرقية,حفر الباطن,1f09a274-8f35-41a1-9a5e-61a5f1cb98fc,TOSL,,,متوسط,...,نعم,البكالوريوس,علوم الحاسبات,تكنولوجيا الاتصالات والمعلومات,3.0,4.0,غير موظف,تدريب,No,0
1825,d10937d0-e234-47c7-ae42-1a83961f0ad6,25.0,ذكر,منطقة الرياض,الرياض,321e4dc8-c3c4-4173-bca5-59a0bd9f327c,PCRF,PCRF,تقليدية,متوسط,...,نعم,البكالوريوس,علوم حاسب,,3.4,5.0,موظف,,,0
1842,a4be6795-4575-40a8-9787-ce34c2caa8be,27.0,ذكر,منطقة الرياض,الرياض,eaa49c92-77da-4177-9970-0146981a1b50,APMR,SRTA,تقليدية,متوسط,...,نعم,البكالوريوس,نظم المعلومات الإدارية,الأعمال والإدارة والقانون,4.0,5.0,موظف,دوام كامل,Yes,0
2186,d454af94-e89d-42fb-b738-c26f0d11b05c,34.0,ذكر,منطقة الرياض,الرياض,321e4dc8-c3c4-4173-bca5-59a0bd9f327c,PCRF,PCRF,تقليدية,متوسط,...,نعم,البكالوريوس,Computer Networking and Cyber security,,2.85,4.0,موظف,,,0
2247,6bb32b34-197f-4b7e-8bd4-ffa501b75558,26.0,أنثى,منطقة الرياض,الرياض,1f09a274-8f35-41a1-9a5e-61a5f1cb98fc,TOSL,,,متوسط,...,لا,البكالوريوس,علوم الحاسب,,3.1,5.0,غير موظف,,,0
2345,dd994829-ab39-4848-8736-f4cdb039091c,32.0,ذكر,منطقة الرياض,الرياض,321e4dc8-c3c4-4173-bca5-59a0bd9f327c,PCRF,PCRF,تقليدية,متوسط,...,نعم,البكالوريوس,مهندس حاسب,,2.45,5.0,موظف,,,0
2487,a72b93e9-7fdf-479b-9233-d37b08627b30,25.0,أنثى,منطقة الرياض,الرياض,32bce2ad-426a-4096-b7d0-78b5f30bd8a0,ABIR,INFA,ناشئة,مبتدئ,...,نعم,البكالوريوس,هندسة برمجيات,,4.26,5.0,موظف,,,0
2609,f2a20174-e002-45bf-a4a8-63be34062a60,24.0,ذكر,منطقة الباحة,الباحة,1f09a274-8f35-41a1-9a5e-61a5f1cb98fc,TOSL,,,متوسط,...,نعم,البكالوريوس,تقنية المعلومات,تكنولوجيا الاتصالات والمعلومات,2.51,4.0,موظف,تدريب,Yes,0
2886,d357f52a-2c6f-4401-89f8-752d4b20f93d,24.0,أنثى,منطقة عسير,خميس مشيط,daf89fda-a75f-487c-a30c-0b65de513e32,CAUF,,,مبتدئ,...,نعم,البكالوريوس,علوم الحاسبات,تكنولوجيا الاتصالات والمعلومات,4.0,5.0,موظف,دوام كامل,Yes,0


##Removing Exact Duplicate Rows

In this section, we will remove the exact duplicate rows from the dataset to ensure data integrity.

In [None]:
traindf.drop_duplicates(inplace=True)
print(traindf)

                                Student ID  Age Gender   Home Region  \
0     4f14c50d-162e-4a15-9cf0-ec129c33bcf0   37    ذكر  منطقة الرياض   
1     0599d409-876b-41a5-af05-749ef0e77d32   21    ذكر    منطقة عسير   
2     38a11c0e-4afc-4261-9c64-e94cc0a272fb   24    ذكر  منطقة الرياض   
3     1693e85b-f80e-40ce-846f-395ddcece6d3   23    ذكر  منطقة الرياض   
4     98a0e8d0-5f80-4634-afd8-322aa0902863   23    ذكر  منطقة الرياض   
...                                    ...  ...    ...           ...   
6543  cd196579-9590-441b-8787-41078f3cee25   31   أنثى  منطقة الرياض   
6544  37bfc11c-ff8c-42dc-9cf9-0d13bb8f7131   27   أنثى  منطقة القصيم   
6545  fc114302-a79f-439f-a08b-fe0a51cf839e   24   أنثى  منطقة الرياض   
6546  4b6d9a36-4402-4c75-bc3a-fca927dbaf65   25    ذكر  منطقة الرياض   
6547  008f3386-0d43-45a4-8372-b282e5a0101a   37   أنثى  منطقة الرياض   

      Home City                            Program ID  \
0        الرياض  453686d8-4023-4506-b2df-fac8b059ac26   
1     خميس مشيط  cc8e

## Cleaning Text in a Column

In this section, we will define a function to clean the text in the "Education Speciality" column by capitalizing the first letter of each word, while ensuring non-alphabetic words remain unchanged.

In [None]:
def clean_text(text):
    if isinstance(text, str):
        text = ' '.join(word.capitalize() if any(c.isalpha() for c in word) else word for word in text.split())
        return text
    return ''
traindf['Education Speciality'] = traindf['Education Speciality'].apply(clean_text)

##Standardizing City Names

In this section, we will standardize the names of cities in the "Home City" column to avoid duplicates and simplify searches and translations. We will also remove the definite article "ال" for easier processing.

Steps:

Correct Specific City Names:

Replace 'ابها' with 'أبها' for proper spelling.

Replace 'مدينة المنور' with 'مدينة المنورة' for accuracy.

Replace 'مكة مكرمة' with 'مكة المكرمة' for proper formatting.

Replace 'جيزان' with 'جازان' for consistency.

Remove the Definite Article:

Use str.replace('ال', '') to remove "ال" from city names, simplifying the text.

In [None]:
traindf['Home City'] = traindf['Home City'].replace('ابها', 'أبها')
traindf['Home City'] = traindf['Home City'].str.replace('ال', '')
traindf['Home City'] = traindf['Home City'].replace('مدينة المنور', 'مدينة المنورة')
traindf['Home City'] = traindf['Home City'].replace('مكة مكرمة', 'مكة المكرمة')
traindf['Home City'] = traindf['Home City'].replace('جيزان', 'جازان')

In [None]:
excluded_columns = ['Student ID', 'Age', 'Program ID', 'Program Start Date', 'Program End Date', 'University Degree Score']

for column in traindf.columns:
    if column not in excluded_columns:
        unique_values = traindf[column].unique()
        print(f"Unique values for {column}: {unique_values}\n")

Unique values for Gender: ['ذكر' 'أنثى']

Unique values for Home Region: ['منطقة الرياض' 'منطقة عسير' 'المنطقة الشرقية' 'منطقة مكة المكرمة'
 'منطقة نجران' 'منطقة المدينة المنورة' 'منطقة القصيم' 'منطقة تبوك'
 'منطقة الحدود الشمالية' 'منطقة جازان' 'منطقة الباحة' 'منطقة حائل'
 'منطقة الجوف' nan]

Unique values for Home City: ['رياض' 'خميس مشيط' 'حفر باطن' 'جدة' 'نجران' 'مكة المكرمة' 'ينبع'
 'مدينة منورة' 'هفوف' 'بريدة' 'تبوك' 'عاصمة مقدسة' 'أحساء' 'جبيل' 'عرعر'
 'أبها' 'أبو عريش' 'دمام' 'طائف' 'باحة' 'أحد مسارحة' 'حائل' 'دلم' 'قطيف'
 'درعية' 'جازان' 'خبر' 'مجمعة' 'خرج' 'ظهران' 'دوادمي' 'مزاحمية' 'عنيزة'
 'قنفذة' 'ليث' 'شرورة' 'رس' 'صامطة' 'شقراء' 'سيهات' 'صفوى' 'سكاكا' 'بدائع'
 'رابغ' 'رفحاء' 'بيشة' 'محايل' 'مذنب' 'خفجي' 'رأس تنورة' 'ضمد' 'قريات' nan
 'حوطة بني تميم' 'ضبا' 'مهد ذهب' 'بارق' 'بيش' 'دومة جندل' 'صبيا' 'عرضيات'
 'حناكية' 'سراة عبيدة' 'أحد رفيدة' 'تربة' 'علا' 'حريملاء' 'بلجرشي' 'بقيق'
 'قرى' 'رج ألمع' 'مجاردة' 'وادي فرع' 'محايل عسير' 'زلفي' 'بدر' 'تثليث'
 'عنك' 'بكيرية' 'بني حس

##Converting Values to Date and Time Format

In this section, we will convert the values in the "Program Start Date" and "Program End Date" columns to a unified date and time format. This is essential for accurate date manipulation and analysis.

In [None]:
traindf['Program Start Date'] = pd.to_datetime(traindf['Program Start Date'])
traindf['Program End Date'] = pd.to_datetime(traindf['Program End Date'])

##Printing Rows with Null Values

In this section, we will identify and print the rows in the dataset that contain null (NaN) values. This is important for understanding data completeness and quality.

In [None]:
missing = traindf.isnull().sum().sort_values(ascending=False)
print(missing[missing > 0])

Still Working                     4535
Job Type                          4535
College                           3862
Technology Type                   2958
Program Skill Level               1645
Program Sub Category Code          920
Employment Status                  557
Age                                 87
University Degree Score System      76
University Degree Score             76
Level of Education                  22
Home City                            2
Home Region                          2
dtype: int64


##Standardizing Language to English

In this section, we will convert all text entries in the dataset to English. This is important for consistency and ease of analysis.

In [None]:
translation_map = {
    'طالب': 'Student',
    'غير موظف': 'Unemployed',
    'موظف': 'Employed',
    'دوام كامل': 'Full-time',
    'دوام جزئي': 'Part-time',
    'نعم': 'Yes',
    'لا': 'No',
    'أنثى': 'Female',
    'ذكر': 'Male',
    'تقليدية': 'Traditional',
    'ناشئة': 'Emerging',
    'عن بعد': 'Remote',
    'حضوري': 'In-person',
    'داعمة': 'Supportive',
    'خريج': 'Graduate',
    'عمل حر': 'Freelance',
    'موظف - طالب': 'Employed - Student',
    'مختلطة': 'Hybrid',
    'متقدم': 'Advanced',
    'مبتدئ': 'Beginner',
    'متوسط': 'Intermediate',
    'البكالوريوس': 'Bachelor',
    'الثانوية العامة': 'High School',
    'الماجستير': 'Master',
    'الدبلوم': 'Diploma',
    'الدكتوراه': 'PhD',
    'تدريب': 'Internship',
    'ثانوي': 'Secondary',
}
traindf.replace(translation_map, inplace=True)

college_translation = {
    'الفنون والعلوم الإنسانية': 'Arts and Humanities',
    'تكنولوجيا الاتصالات والمعلومات': 'Information and Communication Technology',
    'العلوم الاجتماعية والصحافة والإعلام': 'Social Sciences, Journalism and Media',
    'العلوم الطبيعية والرياضيات والإحصاء': 'Natural Sciences, Mathematics and Statistics',
    'الهندسة والتصنيع والبناء': 'Engineering, Manufacturing and Construction',
    'الأعمال والإدارة والقانون': 'Business, Management and Law',
    'التعليم': 'Education',
    'الصحة والرفاة': 'Health and Wellness',
    'البرامج والمؤهلات العامة': 'General Programs and Qualifications',
    'العلوم الدينية': 'Religious Sciences'
}


home_region_translation = {
    'منطقة الرياض': 'Riyadh Region',
    'منطقة عسير': 'Asir Region',
    'المنطقة الشرقية': 'Eastern Province',
    'منطقة نجران': 'Najran Region',
    'منطقة مكة المكرمة': 'Makkah Region',
    'منطقة المدينة المنورة': 'Medina Region',
    'منطقة القصيم': 'Qassim Region',
    'منطقة تبوك': 'Tabuk Region',
    'منطقة جازان': 'Jazan Region',
    'منطقة حائل': 'Hail Region',
    'منطقة الباحة': 'Al-Baha Region',
    'منطقة الحدود الشمالية': 'Northern Borders Region',
    'منطقة الجوف': 'Al-Jawf Region'
}

home_city_translation = {
    'رياض': 'Riyadh',
    'خميس مشيط': 'Khamis Mushait',
    'حفر باطن': 'Hafr Al-Batin',
    'نجران': 'Najran',
    'مكة المكرمة': 'Makkah',
    'جدة': 'Jeddah',
    'ينبع': 'Yanbu',
    'مدينة منورة': 'Medina',
    'هفوف': 'Al-Hofuf',
    'بريدة': 'Buraidah',
    'تبوك': 'Tabuk',
    'عاصمة مقدسة': 'Holy Capital',
    'أحساء': 'Al-Ahsa',
    'جبيل': 'Jubail',
    'أبها': 'Abha',
    'أبو عريش': 'Abu Arish',
    'دمام': 'Dammam',
    'طائف': 'Taif',
    'أحد مسارحة': 'Ahd Musarrah',
    'حائل': 'Hail',
    'دلم': 'Dhulm',
    'قطيف': 'Qatif',
    'باحة': 'Al-Baha',
    'درعية': 'Diriyah',
    'جازان': 'Jazan',
    'مجمعة': 'Al-Majma\'ah',
    'خبر': 'Khobar',
    'خرج': 'Al-Khobar',
    'ظهران': 'Dhahran',
    'دوادمي': 'Dawadmi',
    'مزاحمية': 'Al-Muzahimiyah',
    'شرورة': 'Sharurah',
    'صامطة': 'Samta',
    'شقراء': 'Shuqairah',
    'سيهات': 'Saihat',
    'صفوى': 'Safwa',
    'سكاكا': 'Sakaka',
    'عنيزة': 'Unaizah',
    'قنفذة': 'Qunfudhah',
    'رابغ': 'Rabigh',
    'عرعر': 'Arar',
    'بيشة': 'Bisha',
    'محايل': 'Muhayil',
    'مذنب': 'Mudhnb',
    'خفجي': 'Khafji',
    'رس': 'Al-Ras',
    'رأس تنورة': 'Ras Tanura',
    'قريات': 'Qurayyat',
    'حوطة بني تميم': 'Hawta Bani Tamim',
    'ضبا': 'Duba',
    'رفحاء': 'Rafha',
    'بيش': 'Bish',
    'دومة جندل': 'Dumat Al-Jandal',
    'صبيا': 'Sabya',
    'عرضيات': 'Ardiyat',
    'حناكية': 'Hanakia',
    'سراة عبيدة': 'Saraat Abida',
    'أحد رفيدة': 'Ahd Rufaida',
    'تربة': 'Turbah',
    'علا': 'Ula',
    'حريملاء': 'Huraymila',
    'بلجرشي': 'Baljurashi',
    'بقيق': 'Buqayq',
    'قرى': 'Qura',
    'مجاردة': 'Majardah',
    'وادي فرع': 'WadiAlForaa',
    'زلفي': 'Zulfi',
    'بدر': 'Badr',
    'تثليث': 'Thulayyil',
    'عنك': 'Aynak',
    'بكيرية': 'Baqiriyah',
    'رياض خبراء': 'Riyadh Khubar',
    'ثادق': 'Thadiq',
    'درب': 'Darb',
    'غاط': 'Ghat',
    'طبرجل': 'Tabarjal',
    'طريف': 'Turaif',
    'ظهران جنوب': 'Dhahran South',
    'جموم': 'Jumum',
    'بدائع': 'badaya'
}

job_type_translation = {
    'تطوع': 'Volunteering',
}
traindf['College'] = traindf['College'].replace(college_translation)
traindf['Job Type'] = traindf['Job Type'].replace(job_type_translation)

traindf['Home Region'] = traindf['Home Region'].replace(home_region_translation)

traindf['Home City'] = traindf['Home City'].replace(home_city_translation)


### Filling Null Values Based on Already Existing Information

In [None]:
traindf.loc[traindf['Employment Status'].isin(['Unemployed', 'Student', 'Graduate']), 'Job Type'] = 'No Job'
traindf.loc[traindf['Employment Status'].isin(['Freelance']), 'Job Type'] = 'Flexible'
traindf.loc[traindf['Employment Status'].isin(['Employed']), 'Job Type'] = 'Full-time'
traindf.loc[traindf['Employment Status'].isin(['Employed', 'Freelance']), 'Still Working'] = 'Yes'
traindf.loc[traindf['Job Type'].isin(['No Job']), 'Still Working'] = 'No'

### Filling Missing Values Based on Technology Type Counts

In this section, we will calculate the total count of each value in the "Technology Type" column. This helps in understanding the distribution of technology types and filling in any missing values accordingly.

In [None]:
print((traindf['Technology Type'] == 'Traditional').sum())
print((traindf['Technology Type'] == 'Supportive').sum())
print((traindf['Technology Type'] == 'Emerging').sum())

2670
432
440


### Calculating the Percentage of Each Technology Type

In this section, we will calculate the percentage of each value in the "Technology Type" column to understand their distribution in the dataset.

In [None]:
t_percentage = (traindf['Technology Type'].eq('Traditional').sum() / traindf['Technology Type'].count()) * 100
print(f"Existing (Traditional) percentage: {t_percentage:.2f}%")
s_percentage = (traindf['Technology Type'].eq('Supportive').sum() / traindf['Technology Type'].count()) * 100
print(f"Exsiting (Supportive) percentage: {s_percentage:.2f}%")
e_percentage = (traindf['Technology Type'].eq('Emerging').sum() / traindf['Technology Type'].count()) * 100
print(f"Existing (Emerging) percentage: {e_percentage:.2f}%")
print((traindf['Technology Type']).isnull().sum())

Existing (Traditional) percentage: 75.38%
Exsiting (Supportive) percentage: 12.20%
Existing (Emerging) percentage: 12.42%
2958


### Filling Missing Values Based on Existing Percentages

In this section, we will fill the missing values in the "Technology Type" column according to the existing distribution percentages calculated earlier.

- Distribution Percentages:

Traditional: 75.52% (2236 entries)

Supportive: 12.11% (358 entries)

Emerging: 12.37% (364 entries)

Total Missing Values: 2982

- Filling Missing Values:

Select Indices of Missing Values.

Randomly Fill with Calculated Counts.

In [None]:

nan_indices = traindf[traindf['Technology Type'].isna()].index
random_fill = np.random.choice(nan_indices, size=2236, replace=False)
traindf.loc[random_fill, 'Technology Type'] = 'Traditional'

nan_indices = traindf[traindf['Technology Type'].isna()].index
random_fill = np.random.choice(nan_indices, size=358, replace=False)
traindf.loc[random_fill, 'Technology Type'] = 'Supportive'

nan_indices = traindf[traindf['Technology Type'].isna()].index
random_fill = np.random.choice(nan_indices, size=364, replace=False)
traindf.loc[random_fill, 'Technology Type'] = 'Emerging'


In [None]:
print((traindf['Technology Type']).isnull().sum())

0


### Calculating the Percentage of Each Program Skill Level

In this section, we will calculate the percentage of each value in the "Program Skill Level" column to understand their distribution in the dataset.

In [None]:
b_percentage = (traindf['Program Skill Level'].eq('Beginner').sum() / traindf['Program Skill Level'].count()) * 100
print(f"Existing (Beginner) percentage: {b_percentage:.2f}%")
i_percentage = (traindf['Program Skill Level'].eq('Intermediate').sum() / traindf['Program Skill Level'].count()) * 100
print(f"Exsiting (Intermediate) percentage: {i_percentage:.2f}%")
a_percentage = (traindf['Program Skill Level'].eq('Advanced').sum() / traindf['Program Skill Level'].count()) * 100
print(f"Existing (Advanced) percentage: {a_percentage:.2f}%")

print("Nulls: ",(traindf['Program Skill Level']).isnull().sum())

Existing (Beginner) percentage: 42.29%
Exsiting (Intermediate) percentage: 43.36%
Existing (Advanced) percentage: 14.36%
Nulls:  1645


### Filling Missing Values Based on Existing Percentages for Program Skill Level

In this section, we will fill the missing values in the "Program Skill Level" column according to the existing distribution percentages calculated earlier.

- Distribution Percentages:

Beginner: 42.02% (approximately 692 entries)

Intermediate: 43.53% (approximately 716 entries)

Advanced: 14.44% (approximately 237 entries)

Total Missing Values: 1646

- Filling Missing Values:

Select Indices of Missing Values.

Randomly Fill According to Calculated Counts.

In [None]:
nan_indices = traindf[traindf['Program Skill Level'].isna()].index
random_fill = np.random.choice(nan_indices, size=692, replace=False)
traindf.loc[random_fill, 'Program Skill Level'] = 'Beginner'

nan_indices = traindf[traindf['Program Skill Level'].isna()].index
random_fill = np.random.choice(nan_indices, size=716, replace=False)
traindf.loc[random_fill, 'Program Skill Level'] = 'Intermediate'

nan_indices = traindf[traindf['Program Skill Level'].isna()].index
random_fill = np.random.choice(nan_indices, size=237, replace=False)
traindf.loc[random_fill, 'Program Skill Level'] = 'Advanced'

In [None]:
print("Nulls: ",(traindf['Program Skill Level']).isnull().sum())

Nulls:  0


In [None]:
traindf[traindf['University Degree Score'].isna()]

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
146,755d7e1d-2ffb-4ce3-973d-133d1061a369,,Female,Riyadh Region,Riyadh,1c8a5fbd-9986-486f-9d9b-b50568f2589b,PCRF,PCRF,Traditional,Advanced,...,Yes,,,,,,,,,0
216,f842c0bc-a257-42fc-bf57-2e357f5f808a,,Male,Riyadh Region,Riyadh,fc4f3306-9a9a-4577-8262-2861edf5d3cb,CAUF,SWPS,Traditional,Beginner,...,Yes,Bachelor,,,,,,,,0
329,64e9528e-263e-4263-a5fe-8f870ef604bd,,Male,Riyadh Region,Riyadh,46e7d55e-f11d-47c3-8310-a907c1584671,ABIR,,Traditional,Beginner,...,Yes,Bachelor,,,,,,,,0
718,73eb52d3-4fee-43a1-a53c-f9e3b1a3d678,,Male,Riyadh Region,Riyadh,1c8a5fbd-9986-486f-9d9b-b50568f2589b,PCRF,PCRF,Traditional,Advanced,...,Yes,,,,,,,,,0
774,10faea0d-1392-47f8-bb89-7fed48975dc2,20,Female,Riyadh Region,Riyadh,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,,Traditional,Intermediate,...,Yes,Bachelor,كلية الاقتصاد والعلوم الادارية - نظم معلومات ا...,,,,Student,No Job,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6365,10faea0d-1392-47f8-bb89-7fed48975dc2,20,Female,Riyadh Region,Riyadh,c1ade348-9775-4501-b050-252d5c5e6468,CAUF,SWPS,Traditional,Beginner,...,Yes,Bachelor,كلية الاقتصاد والعلوم الادارية - نظم معلومات ا...,,,,Student,No Job,No,0
6392,c1633694-7f8f-464d-ade1-103d0cbc7f4a,,Female,Riyadh Region,Riyadh,b0ee753e-5f35-4dc8-ae46-03af557569c0,APMR,,Emerging,Intermediate,...,Yes,Master,,,,,,,,0
6436,546d8f49-099c-4ed3-9e38-503ed7133905,,Female,Riyadh Region,Riyadh,33bc9c70-789e-46cd-a207-f1ee3a738f43,APMR,KLTM,Emerging,Intermediate,...,Yes,Secondary,,,,,,,,0
6490,7b78f755-1fdb-4477-a2b1-753485ad9020,,Female,Jazan Region,Jazan,65451000-39ff-4b0b-b0b4-bb2a67bcf9ed,GRST,INFA,Traditional,Intermediate,...,Yes,Bachelor,,,,,,,,0


In [None]:
traindf.dropna(subset=['University Degree Score'], inplace=True) # drop rows with missing values in 'University Degree Score' column

### Filling College Values Based on Education Speciality
The following code categorizes the "College" column based on the "Education Speciality" using predefined keywords for various fields.

- Concise breakdown of the implementation:

Define Keywords for Each College: Each college has specific keywords associated with it.

Create Patterns: Combine keywords into a pattern for searching.

Fill the College Column: Use the patterns to fill the "College" column based on matches found in the "Education Speciality".

In [None]:
# === الهندسة والتصنيع والبناء ===
keywordsENG = [
    'مدنية', 'المدنية', 'Civil', 'civil', 'مدنيه', 'المدنيه', 'كهربائية', 'الكهربائية', 'Electrical', 'electrical',
    'الكهربائيه', 'كهربائيه', 'كهربائي', 'ميكانيكية', 'الميكانيكية', 'Mechanical', 'mechanical', 'مكيانيكيه',
    'الميكانيكيه', 'صناعية', 'الصناعية', 'Industrial', 'industrial', 'الصناعيه', 'صناعيه', 'إلكترونية', 'الإلكترونية',
    'Electronics', 'electronics', 'الكترونيه', 'الالكترونيه', 'الكترونية', 'كيميائية', 'الكيميائية', 'Chemical', 'chemical',
    'كيميائيه', 'الكيميائيه', 'البترول', 'بترول', 'معمارية', 'معماريه', 'المعمارية', 'المعماريه', 'Architecture',
    'architecture', 'عماره', 'عمارة', 'هياكل', 'هندسة', 'هندسه', 'الهندسة', 'الهندسه', 'مساحة', 'آلات دقيقة وتحكم', 'Engineer In Marine Technology'
]
patternENG = '|'.join(keywordsENG)
traindf.loc[traindf['Education Speciality'].str.contains(patternENG, na=False), 'College'] = 'الهندسة والتصنيع والبناء'


# === تكنولوجيا الاتصالات والمعلومات ===
keywordsCS = [
    'Computer', 'CS', 'Cs', 'cs', 'Software', 'حاسب', 'برمجة', 'برمجيات', 'تكنولوجيا', 'تقنية', 'الحاسب', 'شبكات',
    'الشبكات', 'المعلومات', 'معلومات', 'نظم معلومات', 'نظم المعلومات', 'الحاسوبية', 'حاسوبية', 'أمن', 'سبراني',
    'سيبراني', 'الأمن', 'Information', 'Cybersecurity', 'ذكاء', 'الكترونيات', 'Is', 'IS', 'is', 'برمجه',
    'البرمجة', 'البرمجه', 'it', 'IT', 'It', 'swe', 'SWE', 'Swe', 'Cyber', 'cyber', 'cybersecurity', 'information',
    'Network', 'network', 'networks', 'Networking', 'net', 'Net', 'NET', 'Data', 'data', 'datascience', 'تقنيه',
    'اكترونيات', 'بيانات', 'البيانات', 'الذكاء الاصطناعي', 'الذكاء الصناعي', 'إلكترونيات', 'Computing', 'اتصالات',
    'الاتصالات', 'programming', 'Programming', 'حاسوب', 'الحاسوب', 'نظم', 'الكمبيوتر', 'كمبيوتر', 'Artificial Intelligence',
    'الحوسبة', 'الاصطناعي', 'Web', 'Softeare Engineering', 'دعم فني', 'علوم الحاشب', 'مهارات رقمية', 'Artifical Intellegence',
    'Conputer Scince', 'تطوير تطبيات', 'تطوير التطبيقات'
]
patternCS = '|'.join(keywordsCS)
traindf.loc[traindf['Education Speciality'].str.contains(patternCS, na=False), 'College'] = 'تكنولوجيا الاتصالات والمعلومات'


# === العلوم الطبيعية والرياضيات والإحصاء ===
keywordsSCI = [
    'احياء', 'الاحياء', 'أحياء', 'الأحياء', 'الجغرافيا', 'جغرافيا', 'رياضيات', 'الرياضيات',
    'كيمياء', 'الكيمياء', 'فيزياء', 'الفيزياء', 'إحصاء', 'الإحصاء', 'احصاء', 'الاحصاء',
    'Mathematics', 'mathematics', 'Mathematical', 'mathematical', 'Biology', 'biology',
    'Chemistry', 'chemistry', 'Physics', 'physics', 'Statistics', 'statistics', 'Geography', 'geography'
    'Geology', 'Space Science', 'Bsc Geology', 'جيولوجيا', 'علوم تطبيقيه'
]
patternSCI = '|'.join(keywordsSCI)
traindf.loc[traindf['Education Speciality'].str.contains(patternSCI, na=False), 'College'] = 'العلوم الطبيعية والرياضيات والإحصاء'


# === الصحة والرفاة ===
keywordsH = [
    'امراض', 'الامراض', 'Health', 'health', 'healthcare', 'Healthcare', 'تمريض', 'التمريض', 'التمريضية',
    'Nursing', 'nursing', 'صيدلة', 'Pharmacy', 'pharmacy', 'صيدله', 'الصيدلة', 'صحية', 'الصحة', 'الصحيه', 'الصحى',
    'اشعه', 'الاشعه', 'Radiology', 'radiology', 'الأشعة', 'أشعة', 'الأشعه', 'أشعه', 'Dental', 'Occupational Therapy',
    'علاج وظيفي', 'الاستشارات الوراثية', 'Respiratory Care', 'علوم أسرية', 'تجميل', 'Beauty', 'التغذية', 'بحوث العمليات', 'طب نووي', 'سكرتارية طبية'
]
patternH = '|'.join(keywordsH)
traindf.loc[traindf['Education Speciality'].str.contains(patternH, na=False), 'College'] = 'الصحة والرفاة'


# === الأعمال والإدارة والقانون ===
keywordsB = [
    'أعمال', 'Business', 'اعمال', 'محاسبة', 'Accounting', 'تمويل', 'Finance', 'تسويق', 'Marketing', 'البشرية', 'Human',
    'عامة', 'Public', 'ادارة', 'Management', 'إدارة', 'اداره', 'إداره', 'اقتصاد', 'Economics', 'إقتصاد', 'الاقتصاد',
    'الإقتصاد', 'قانون', 'Law', 'مشاريع', 'Project', 'ريادة الأعمال', 'Entrepreneurship', 'تجارة', 'E-Commerce',
    'بنوك', 'Banking', 'تأمين', 'Insurance', 'تحليل مالي', 'Financial Analysis', 'حوكمة', 'Corporate Governance',
    'موارد', 'مالية', 'ماليه', 'المالية', 'الماليه', 'حقوق', 'الحقوق', 'Emba', 'Mba', 'Pmp', 'علوم القرارات',
    'سكرتير تنفيذي', 'علاقات عامه', 'اتصال استراتيجي', 'الادلة الجنائية', 'Ebusiness', 'Actuarial Science', 'علوم امنية', 'قيادة وتخطيط'
]
patternB = '|'.join(keywordsB)
traindf.loc[traindf['Education Speciality'].str.contains(patternB, na=False), 'College'] = 'الأعمال والإدارة والقانون'


# === العلوم الاجتماعية والصحافة والإعلام ===
keywordsS = [
    'تاريخ', 'تاريخيه', 'تاريخية', 'التاريخ', 'التاريخيه', 'التاريخية', 'History',
    'صحافة', 'صحافه', 'الصحافة', 'الصحافه', 'Journalism', 'اعلام', 'الإعلام', 'والإعلام', 'Media',
    'اجتماعيات', 'Social Studies', 'اجتماعية', 'اجتماعيه', 'اجتماع', 'النفس', 'نفس', 'التوجية والاصلاح الاسري'
]
patternS = '|'.join(keywordsS)
traindf.loc[traindf['Education Speciality'].str.contains(patternS, na=False), 'College'] = 'العلوم الاجتماعية والصحافة والإعلام'


# === الفنون والعلوم الإنسانية ===
keywordsLan = [
    'انجليزي', 'إنجليزي', 'الانجليزي', 'الإنجليزي', 'English', 'انجليزيه', 'الانجليزيه', 'إنجليزيه', 'الإنجليزيه',
    'عربي', 'العربي', 'عربية', 'العربية', 'عربيه', 'العربيه', 'Arabic', 'لغه', 'اللغه', 'لغة', 'اللغة', 'اللغات',
    'لغات', 'ترجمة', 'ترجمه', 'الترجمة', 'الترجمه', 'فنية', 'فنيه', 'الفنية', 'الفنيه', 'Art Education',
    'Art And Design', 'Multimedia', 'Interiot', 'Design', 'تصميم', 'Product Design', 'تصميم المنتجات', 'تصميم داخلي',
    'تصميم ازياء', 'تصميم مواقع', 'فنون بصرية', 'فنون بصرية نحت', 'نحت فنون بصرية', 'الجرافيكس والوسائط المتعددة',
    'التصميم الجرافيكي والوسائط المتعددة', 'تصميم منتحات', 'تصميم الازياء', 'جرافيك ووسائط رقمية', 'وسائط متعددة تفاعلية',
    'تصميم جرافيكي ووسائط متعددة', 'الوسائط المتعددة ورسومات الويب', 'تصميم منتجات Product Design',
    'Products Design', 'Product Design', 'Language', 'اللغويات التطبيقية', 'لغويات تطبيقية'
]
patternLan = '|'.join(keywordsLan)
traindf.loc[traindf['Education Speciality'].str.contains(patternLan, na=False), 'College'] = 'الفنون والعلوم الإنسانية'

# === العلوم الدينية ===
keywordsRE = [
    'أصول الدين', 'اصول دين', 'القرأن الكريم والدراسات الاسلاميه',
    'دراسات إسلامية', 'دراسات اسلامية', 'دراسات اسلاميه', 'دراسات الإسلاميه',
    'الشريعة', 'دراسات وقضايا معاصرة'
]
patternRE = '|'.join(keywordsRE)
traindf.loc[traindf['Education Speciality'].str.contains(patternRE, na=False), 'College'] = 'العلوم الدينية'

# === التعليم ===
keywordsEdu = [
    'تعليم', 'التعليم', 'تقنيات التعليم', 'تقنيات تعليم', 'التعليم الإلكتروني', 'تعليم إلكتروني',
    'مناهج وطرق تدريس', 'المناهج وطرق التدريس',
    'رياض اطفال', 'تعليم ماقبل المرحلة الابتدائية',
    'تربية', 'تربوي', 'تربية خاصة', 'تربيه خاصة صعوبات تعلم', 'تربية خاصه', 'تربية فكريه',
    'Special Education', 'Education', 'Curriculum', 'Early Childhood',
    'Educational Technology', 'Kindergarten', 'Teacher', 'Teaching', 'المدربين'
]
patternEdu = '|'.join(keywordsEdu)
traindf.loc[traindf['Education Speciality'].str.contains(patternEdu, na=False), 'College'] = 'التعليم'


### Filling Missing Values in Program Sub Category Code

In this section, we will fill the missing values in the "Program Sub Category Code" column with the label "No sub." This is appropriate since a subcategory is not required to exist for every program.

In [None]:
traindf['Program Sub Category Code'] = traindf['Program Sub Category Code'].fillna('No sub')

### Finalizing the DataFrame

If you want to ensure that only rows with relevant data in the "Still Working" and "College" columns remain, you can drop the rows where both are missing.

In [None]:
traindf = traindf.dropna(subset=['Still Working', 'College'], how='all')
traindf

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37,Male,Riyadh Region,Riyadh,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,Traditional,Intermediate,...,Yes,Bachelor,هندسة حاسب الالي,تكنولوجيا الاتصالات والمعلومات,2.44,4.0,Unemployed,No Job,No,0
1,0599d409-876b-41a5-af05-749ef0e77d32,21,Male,Asir Region,Khamis Mushait,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,Supportive,Intermediate,...,Yes,Bachelor,الإذاعة والتلفزيون والفيلم,Arts and Humanities,5.00,5.0,Student,No Job,No,0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24,Male,Riyadh Region,Riyadh,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,No sub,Traditional,Intermediate,...,Yes,Bachelor,Information Technology,تكنولوجيا الاتصالات والمعلومات,3.50,5.0,Employed,Full-time,Yes,0
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23,Male,Riyadh Region,Riyadh,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,Traditional,Intermediate,...,Yes,Bachelor,حوسبة تطبيقية - (مسار شبكات الحاسب),تكنولوجيا الاتصالات والمعلومات,3.55,5.0,Graduate,No Job,No,0
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23,Male,Riyadh Region,Riyadh,d32da0e9-1aed-48c3-992d-a22f9ccc741e,CAUF,SWPS,Traditional,Intermediate,...,No,Bachelor,نظم المعلومات الحاسوبية,تكنولوجيا الاتصالات والمعلومات,4.00,5.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6543,cd196579-9590-441b-8787-41078f3cee25,31,Female,Riyadh Region,Riyadh,4f8c696a-b783-4d40-9776-105f6d3bd624,CAUF,SWPS,Traditional,Beginner,...,Yes,Bachelor,تقنية المعلومات,تكنولوجيا الاتصالات والمعلومات,4.40,5.0,,,,0
6544,37bfc11c-ff8c-42dc-9cf9-0d13bb8f7131,27,Female,Qassim Region,Buraidah,e94942dd-8684-4746-97ae-df567b9b0a4a,PCRF,PCRF,Traditional,Beginner,...,Yes,Bachelor,علوم الحاسب,تكنولوجيا الاتصالات والمعلومات,4.46,5.0,Employed,Full-time,Yes,0
6545,fc114302-a79f-439f-a08b-fe0a51cf839e,24,Female,Riyadh Region,Riyadh,02ae0b47-64a6-47a1-b3c5-c0e4df393c30,PCRF,PCRF,Traditional,Beginner,...,No,Bachelor,نظم المعلومات,تكنولوجيا الاتصالات والمعلومات,4.93,5.0,Employed,Full-time,Yes,1
6546,4b6d9a36-4402-4c75-bc3a-fca927dbaf65,25,Male,Riyadh Region,Riyadh,9b4cedaa-fac0-4eac-aa4b-b05b6a0c97ff,PCRF,PCRF,Traditional,Intermediate,...,Yes,Bachelor,تقنية المعلومات,تكنولوجيا الاتصالات والمعلومات,4.00,4.0,Unemployed,No Job,No,0


In [None]:
missing = traindf.isnull().sum().sort_values(ascending=False)
print(missing[missing > 0])

Employment Status    478
Job Type             478
Still Working        478
College              200
Age                   15
Home Region            1
Home City              1
dtype: int64


### Filling College Column for High School Students

To fill the "College" column with "No colloge" for students who are in high school, you can use the following approach:

In [None]:
traindf.loc[traindf['Level of Education'] == 'Secondary', 'College'] = 'No College'

### Dropping Remaining Null Rows in College Column

If after filling the "College" column for high school students, you find that only 4 null rows remain, you can drop those rows to clean up your DataFrame.

In [None]:
traindf[traindf['College'].isna()]

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
1157,e26065dc-956f-4fcd-9d3c-1a30e38829c2,42,Male,Riyadh Region,Riyadh,3218820e-5fc3-4dcb-8c23-17ac8de5e4b0,GRST,INFA,Traditional,Intermediate,...,Yes,Diploma,علو,,3.0,4.0,Employed,Full-time,Yes,0
1670,a50e88b3-32a2-4de1-8c05-94e13d3d63c2,47,Male,Riyadh Region,Riyadh,4f8c696a-b783-4d40-9776-105f6d3bd624,CAUF,SWPS,Traditional,Beginner,...,Yes,PhD,Inter-iot,,4.85,5.0,Freelance,Flexible,Yes,0
2040,a631a556-19de-4c1a-8786-34f36c502a5c,27,Female,Riyadh Region,Riyadh,b9d1b38d-44e0-4943-a008-8cb527a88f3c,APMR,SRTA,Traditional,Intermediate,...,No,Master,E-business,,4.6,5.0,Employed,Full-time,Yes,1
2751,896ef0f6-744a-415d-b799-6eebddf944b8,19,Female,Riyadh Region,Riyadh,4957a013-46a7-419c-93d5-ebf3741ab2a1,PCRF,PCRF,Traditional,Beginner,...,No,Bachelor,تحضيري علمي,,4.88,5.0,Student,No Job,No,0
5628,854fd98d-a0c7-4197-8670-6f2301e9d476,21,Female,Riyadh Region,Riyadh,e6be9f1a-dffa-4662-a7d2-74a6e12ad4b2,CAUF,SWPS,Supportive,Beginner,...,Yes,Bachelor,السياحة,,5.0,5.0,Graduate,No Job,No,0
5776,9fedbc0b-f2e1-4171-b1fb-2b5e50aa2b55,22,Male,Riyadh Region,Riyadh,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,Traditional,Beginner,...,Yes,Bachelor,جامعة الملك سعود,,4.53,5.0,Student,No Job,No,0


### Dropping Rows with Missing Values in Specific Columns

To ensure that your DataFrame only contains rows with non-null values in the specified columns, you can drop rows where either "Still Working" or "College" has missing values.

In [None]:
traindf = traindf.dropna(subset=['Still Working'])
traindf = traindf.dropna(subset=['College'])

In [None]:
traindf[traindf['Age'].isna()]

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
141,3b792401-76ab-411a-bb84-b62ced139e64,,Female,Riyadh Region,Riyadh,3ec67853-6b57-48cc-9bc6-956e286a7c65,CAUF,ERST,Traditional,Intermediate,...,No,Bachelor,نظم المعلومات الادارية,تكنولوجيا الاتصالات والمعلومات,4.99,5.0,Graduate,No Job,No,1
523,822d6986-8a43-4342-928f-ebf8b0014f05,,Male,Eastern Province,Jubail,5364b84a-0ed3-4e1c-9b7d-5a5cf1e14b20,CAUF,SWPS,Traditional,Beginner,...,Yes,Secondary,,No College,88.0,100.0,Student,No Job,No,0
1349,822d6986-8a43-4342-928f-ebf8b0014f05,,Male,Eastern Province,Jubail,ca70e847-57d2-4aad-862a-423e3076577a,CAUF,SWPS,Traditional,Intermediate,...,Yes,Secondary,,No College,88.0,100.0,Student,No Job,No,0
4340,3b792401-76ab-411a-bb84-b62ced139e64,,Female,Riyadh Region,Riyadh,4ad7a7dc-abcf-43fa-b2e3-07095e083661,PCRF,PCRF,Traditional,Advanced,...,No,Bachelor,نظم المعلومات الادارية,تكنولوجيا الاتصالات والمعلومات,4.99,5.0,Graduate,No Job,No,1
4377,03fb5ad5-55b3-4064-b8df-313bec5b2b10,,Male,Riyadh Region,Riyadh,67bbe41c-20c2-4799-9e15-eaae2e94f157,APMR,No sub,Traditional,Intermediate,...,Yes,Secondary,,No College,99.0,100.0,Student,No Job,No,0
4904,0199e322-e968-4637-8fee-5cc622c3adcc,,Female,Riyadh Region,Riyadh,3ec67853-6b57-48cc-9bc6-956e286a7c65,CAUF,ERST,Traditional,Intermediate,...,Yes,Bachelor,Management Information System (mis),الأعمال والإدارة والقانون,4.9,5.0,Graduate,No Job,No,0
5013,782c9649-1b29-452a-bb95-bdd2f89db042,,Male,Riyadh Region,Riyadh,f2374bcb-111c-402d-beaf-37433687db4d,TOSL,No sub,Traditional,Beginner,...,Yes,Secondary,,No College,99.13,100.0,Student,No Job,No,0
5049,782c9649-1b29-452a-bb95-bdd2f89db042,,Male,Riyadh Region,Riyadh,9144fab5-59f3-4b98-bc3c-0784a04883fe,TOSL,TOSL,Supportive,Advanced,...,Yes,Secondary,,No College,99.13,100.0,Student,No Job,No,0
5132,1f0c3148-3705-45a0-b171-5b2a9334595a,,Male,Asir Region,Abha,013916fa-85bd-46bc-80c8-f37c036fd142,PCRF,PCRF,Traditional,Advanced,...,Yes,Secondary,,No College,97.0,100.0,Student,No Job,No,0
5348,03fb5ad5-55b3-4064-b8df-313bec5b2b10,,Male,Riyadh Region,Riyadh,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,No sub,Traditional,Intermediate,...,Yes,Secondary,,No College,99.0,100.0,Student,No Job,No,0


In [None]:
missing = traindf.isnull().sum().sort_values(ascending=False)
print(missing[missing > 0])

Age            11
Home Region     1
Home City       1
dtype: int64


### Filling Age for Secondary School Students

To fill the "Age" column with random values between 16 and 18 for students in secondary school who currently have missing age values, you can use the following code:

In [None]:
mask_secondary = (traindf['Level of Education'] == 'Secondary') & (traindf['Age'].isna())
traindf.loc[mask_secondary, 'Age'] = np.random.randint(16, 19, size=mask_secondary.sum())

### Filling Age for Bachelor Students

To fill the "Age" column with random values between 18 and 30 for students in the bachelor program who currently have missing age values, you can use the following code:

In [None]:
mask_bachelor = (traindf['Level of Education'] == 'Bachelor') & (traindf['Age'].isna())
traindf.loc[mask_bachelor, 'Age'] = np.random.randint(18, 31, size=mask_bachelor.sum())

In [None]:
missing = traindf.isnull().sum().sort_values(ascending=False) # Check null values and sort them
print(missing[missing > 0])

Home Region    1
Home City      1
dtype: int64


### Checking and Dropping Rows with Null Values in Home Region and Home City

If you find that both "Home Region" and "Home City" are null for certain rows, and since you cannot determine their values, you can drop these rows from the DataFrame.

In [None]:
print(traindf[traindf['Home Region'].isnull() & traindf['Home City'].isnull()])

                                Student ID  Age Gender Home Region Home City  \
1864  cc394a25-74ed-43f9-92bc-e0021fc969c5   45   Male         NaN       NaN   

                                Program ID Program Main Category Code  \
1864  ce3562c8-8d27-4ffb-8dfc-e1dd4527b32a                       PCRF   

     Program Sub Category Code Technology Type Program Skill Level  ...  \
1864                      PCRF     Traditional            Beginner  ...   

     Completed Degree Level of Education    Education Speciality  \
1864              Yes             Master  نظم المعلومات الإدارية   

                             College University Degree Score  \
1864  تكنولوجيا الاتصالات والمعلومات                     4.0   

     University Degree Score System Employment Status   Job Type  \
1864                            5.0          Employed  Full-time   

      Still Working  Y  
1864            Yes  0  

[1 rows x 24 columns]


In [None]:
traindf = traindf[~(traindf['Home Region'].isnull() & traindf['Home City'].isnull())] # Dropping Home City, since Home Region is also null so there is no way of knowing the city.

In [None]:
missing = traindf.isnull().sum().sort_values(ascending=False)
print(missing[missing > 0])

Series([], dtype: int64)


In [None]:
traindf

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Completed Degree,Level of Education,Education Speciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37,Male,Riyadh Region,Riyadh,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,Traditional,Intermediate,...,Yes,Bachelor,هندسة حاسب الالي,تكنولوجيا الاتصالات والمعلومات,2.44,4.0,Unemployed,No Job,No,0
1,0599d409-876b-41a5-af05-749ef0e77d32,21,Male,Asir Region,Khamis Mushait,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,Supportive,Intermediate,...,Yes,Bachelor,الإذاعة والتلفزيون والفيلم,Arts and Humanities,5.00,5.0,Student,No Job,No,0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24,Male,Riyadh Region,Riyadh,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,No sub,Traditional,Intermediate,...,Yes,Bachelor,Information Technology,تكنولوجيا الاتصالات والمعلومات,3.50,5.0,Employed,Full-time,Yes,0
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23,Male,Riyadh Region,Riyadh,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,Traditional,Intermediate,...,Yes,Bachelor,حوسبة تطبيقية - (مسار شبكات الحاسب),تكنولوجيا الاتصالات والمعلومات,3.55,5.0,Graduate,No Job,No,0
5,6b27cd77-23aa-438b-bd0b-7abdb40a717c,30,Male,Riyadh Region,Riyadh,32bce2ad-426a-4096-b7d0-78b5f30bd8a0,ABIR,INFA,Emerging,Beginner,...,Yes,Bachelor,هندسة حاسب آلي,تكنولوجيا الاتصالات والمعلومات,4.07,5.0,Employed,Full-time,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6542,537cbf47-e5c8-4ff8-a636-ea3aa34c36b0,25,Female,Riyadh Region,Riyadh,bf68a9ad-0543-4e6e-8a42-695953db697a,CAUF,SWPS,Emerging,Intermediate,...,Yes,Bachelor,علوم الحاسب,تكنولوجيا الاتصالات والمعلومات,4.20,5.0,Employed,Full-time,Yes,0
6544,37bfc11c-ff8c-42dc-9cf9-0d13bb8f7131,27,Female,Qassim Region,Buraidah,e94942dd-8684-4746-97ae-df567b9b0a4a,PCRF,PCRF,Traditional,Beginner,...,Yes,Bachelor,علوم الحاسب,تكنولوجيا الاتصالات والمعلومات,4.46,5.0,Employed,Full-time,Yes,0
6545,fc114302-a79f-439f-a08b-fe0a51cf839e,24,Female,Riyadh Region,Riyadh,02ae0b47-64a6-47a1-b3c5-c0e4df393c30,PCRF,PCRF,Traditional,Beginner,...,No,Bachelor,نظم المعلومات,تكنولوجيا الاتصالات والمعلومات,4.93,5.0,Employed,Full-time,Yes,1
6546,4b6d9a36-4402-4c75-bc3a-fca927dbaf65,25,Male,Riyadh Region,Riyadh,9b4cedaa-fac0-4eac-aa4b-b05b6a0c97ff,PCRF,PCRF,Traditional,Intermediate,...,Yes,Bachelor,تقنية المعلومات,تكنولوجيا الاتصالات والمعلومات,4.00,4.0,Unemployed,No Job,No,0


### Dropping the Education Specialty Column


To remove the "Education Specialty" column from the DataFrame due to its inconsistent values and non-essential nature for prediction, you can use the following code:

In [None]:
traindf.drop(columns=['Education Speciality'], inplace=True)

print(traindf.head())

                             Student ID  Age Gender    Home Region  \
0  4f14c50d-162e-4a15-9cf0-ec129c33bcf0   37   Male  Riyadh Region   
1  0599d409-876b-41a5-af05-749ef0e77d32   21   Male    Asir Region   
2  38a11c0e-4afc-4261-9c64-e94cc0a272fb   24   Male  Riyadh Region   
3  1693e85b-f80e-40ce-846f-395ddcece6d3   23   Male  Riyadh Region   
5  6b27cd77-23aa-438b-bd0b-7abdb40a717c   30   Male  Riyadh Region   

        Home City                            Program ID  \
0          Riyadh  453686d8-4023-4506-b2df-fac8b059ac26   
1  Khamis Mushait  cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60   
2          Riyadh  e006900d-05a9-4c2b-a36f-0ffb9fce44cd   
3          Riyadh  2ec15f6b-233b-428a-b9f5-e40bc8d14cf9   
5          Riyadh  32bce2ad-426a-4096-b7d0-78b5f30bd8a0   

  Program Main Category Code Program Sub Category Code Technology Type  \
0                       PCRF                      PCRF     Traditional   
1                       APMR                      SWPS      Supportive   
2 

### Feature Engineering: Creating Age Range by Program

To create a new column that calculates the age range for each program based on the minimum and maximum ages, you can follow these steps:

In [None]:
age_ranges = traindf.groupby('Program ID')['Age'].agg(['min', 'max']).reset_index()

age_ranges['Age Range by Program'] = age_ranges['min'].astype(str) + '-' + age_ranges['max'].astype(str)

traindf = traindf.merge(age_ranges[['Program ID', 'Age Range by Program']], on='Program ID', how='left')


### Feature Engineering: Calculating Program Duration

To create a new feature that calculates the duration of each program in days based on the start and end dates, you can use the following code:

In [None]:

traindf['Program Start Date'] = pd.to_datetime(traindf['Program Start Date'])
traindf['Program End Date'] = pd.to_datetime(traindf['Program End Date'])

traindf['Program Duration'] = (traindf['Program End Date'] - traindf['Program Start Date']).dt.days

print(traindf.head())

                             Student ID  Age Gender    Home Region  \
0  4f14c50d-162e-4a15-9cf0-ec129c33bcf0   37   Male  Riyadh Region   
1  0599d409-876b-41a5-af05-749ef0e77d32   21   Male    Asir Region   
2  38a11c0e-4afc-4261-9c64-e94cc0a272fb   24   Male  Riyadh Region   
3  1693e85b-f80e-40ce-846f-395ddcece6d3   23   Male  Riyadh Region   
4  6b27cd77-23aa-438b-bd0b-7abdb40a717c   30   Male  Riyadh Region   

        Home City                            Program ID  \
0          Riyadh  453686d8-4023-4506-b2df-fac8b059ac26   
1  Khamis Mushait  cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60   
2          Riyadh  e006900d-05a9-4c2b-a36f-0ffb9fce44cd   
3          Riyadh  2ec15f6b-233b-428a-b9f5-e40bc8d14cf9   
4          Riyadh  32bce2ad-426a-4096-b7d0-78b5f30bd8a0   

  Program Main Category Code Program Sub Category Code Technology Type  \
0                       PCRF                      PCRF     Traditional   
1                       APMR                      SWPS      Supportive   
2 

In [None]:
traindf.head(5)

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,...,Level of Education,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y,Age Range by Program,Program Duration
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37,Male,Riyadh Region,Riyadh,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,Traditional,Intermediate,...,Bachelor,Information and Communication Technology,2.44,4.0,Unemployed,No Job,No,0,24-37,11
1,0599d409-876b-41a5-af05-749ef0e77d32,21,Male,Asir Region,Khamis Mushait,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,Supportive,Intermediate,...,Bachelor,Arts and Humanities,5.0,5.0,Student,No Job,No,0,20-46,4
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24,Male,Riyadh Region,Riyadh,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,No sub,Traditional,Intermediate,...,Bachelor,Information and Communication Technology,3.5,5.0,Employed,Full-time,Yes,0,16-33,53
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23,Male,Riyadh Region,Riyadh,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,Traditional,Intermediate,...,Bachelor,Information and Communication Technology,3.55,5.0,Graduate,No Job,No,0,21-38,32
4,6b27cd77-23aa-438b-bd0b-7abdb40a717c,30,Male,Riyadh Region,Riyadh,32bce2ad-426a-4096-b7d0-78b5f30bd8a0,ABIR,INFA,Emerging,Beginner,...,Bachelor,Information and Communication Technology,4.07,5.0,Employed,Full-time,Yes,0,20-43,11


In [None]:
traindf.to_csv('train_cleaned.csv', index=False)