## DATA PRE-PROCESSING

In [None]:
import pandas as pd

# Load the dataset
file_path = 'Survey for AL stream model.csv'
data = pd.read_csv(file_path)

In [160]:
data.shape

(523, 20)

In [161]:
# Adjust the columns_to_drop list accordingly
columns_to_drop = ['Timestamp', 'Which higher education field are you currently following or have you completed? ', 'Column 15', 'If so, what kind of job do you do? ', 'On a scale of 1 to 5, how satisfied are you with your current job?  ', 'If you had the opportunity, would you prefer to do a different job? If yes, what job would you like to do?  ', 'What advice would you give to students selecting their A/L stream? Additionally, do you have any feedback or suggestions for improving this project? ']

# Remove specified columns if they exist in the DataFrame
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

In [162]:
# Rename columns
data = data.rename(columns={
    'Kindly input your O/L exam results [Mathematics]': 'Maths',
    'Kindly input your O/L exam results [Science]': 'Science',
    'Kindly input your O/L exam results [Religion]': 'Religion',
    'Kindly input your O/L exam results [English]': 'English',
    'Kindly input your O/L exam results [Sinhala or Tamil]': 'Sinhala or Tamil',
    'Kindly input your O/L exam results [History]': 'History',
    'Kindly input your O/L exam results [Basket I]': 'Basket I',
    'Kindly input your O/L exam results [Basket II]': 'Basket II',
    'Kindly input your O/L exam results [Basket III]': 'Basket III',
    'What is your favorite subject? ': 'Favorite Subject',
    'What career are you interested in pursuing? ': 'Career',
    'Did everything go well with your A/L exams? ': 'A/L Status',
    'Which A/L stream did you choose?': 'Stream'
})

In [163]:
import re

# Function to preprocess the description
def preprocess_description(text):
    # Normalize case to lower for consistency
    text = text.lower()

    # Remove special characters except for full stops, preserve intra-word spaces
    text = re.sub(r'[^\w\s\.]', '', text)  # \w matches any alphanumeric character, \s is for spaces, \. is for full stops

    # Remove multiple consecutive spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [164]:
columns_to_preprocess = ['Favorite Subject', 'Career']
for col in columns_to_preprocess:
    data[col] = data[col].astype(str).apply(preprocess_description)

In [165]:
data.head()

Unnamed: 0,Stream,Did everything go well with your A/L exams?,Favorite Subject,Maths,Science,Religion,English,Sinhala or Tamil,History,Basket I,Basket II,Basket III,Career
0,Physical Science,"Mostly, but I faced some challenges",maths,A,A,A,A,B,A,A,C,A,software engineering
1,Physical Science,"Yes, everything went well",maths,A,A,A,A,A,A,A,B,A,software engineer
2,Physical Science,"Mostly, but I faced some challenges",mathematics,A,B,A,A,A,A,A,A,B,engineering
3,Physical Science,"Mostly, but I faced some challenges",ict,B,C,C,A,B,A,A,A,C,software engineer
4,Physical Science,"Mostly, but I faced some challenges",science,A,A,A,A,A,A,B,S,A,it industry


In [166]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 13 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   Stream                                       523 non-null    object
 1   Did everything go well with your A/L exams?  523 non-null    object
 2   Favorite Subject                             523 non-null    object
 3   Maths                                        523 non-null    object
 4   Science                                      523 non-null    object
 5   Religion                                     523 non-null    object
 6   English                                      523 non-null    object
 7   Sinhala or Tamil                             523 non-null    object
 8   History                                      523 non-null    object
 9   Basket I                                     523 non-null    object
 10  Basket II     

In [167]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
import re

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tokens

# Apply preprocessing to each description
tokenized_favorite_subjets = data['Favorite Subject'].apply(preprocess_text)

# Flatten the list of lists into a single list of tokens
all_tokens = [token for sublist in tokenized_favorite_subjets for token in sublist]

# Count the frequency of each token
token_counts = Counter(all_tokens)

# Most common tokens
most_common_tokens = token_counts.most_common(6500)

most_common_tokens

[('maths', 171),
 ('science', 98),
 ('mathematics', 67),
 ('combined', 61),
 ('ict', 38),
 ('economics', 17),
 ('accounting', 16),
 ('biology', 13),
 ('english', 12),
 ('technology', 11),
 ('business', 10),
 ('econ', 9),
 ('studies', 9),
 ('history', 7),
 ('french', 7),
 ('geography', 7),
 ('art', 5),
 ('agri', 5),
 ('literature', 5),
 ('dancing', 4),
 ('information', 4),
 ('bio', 3),
 ('accounts', 3),
 ('pure', 3),
 ('sinhala', 2),
 ('japanese', 2),
 ('math', 2),
 ('commerce', 2),
 ('political', 2),
 ('oriental', 1),
 ('music', 1),
 ('computer', 1),
 ('applied', 1),
 ('tamil', 1),
 ('account', 1),
 ('german', 1),
 ('economy', 1)]

In [168]:
# Assuming most_common_tokens is a list of tuples like: [('word1', count1), ('word2', count2), ...]
for word, count in most_common_tokens:
    print(word, count)

maths 171
science 98
mathematics 67
combined 61
ict 38
economics 17
accounting 16
biology 13
english 12
technology 11
business 10
econ 9
studies 9
history 7
french 7
geography 7
art 5
agri 5
literature 5
dancing 4
information 4
bio 3
accounts 3
pure 3
sinhala 2
japanese 2
math 2
commerce 2
political 2
oriental 1
music 1
computer 1
applied 1
tamil 1
account 1
german 1
economy 1


In [169]:
# Function to check for the presence of keyword groups
def check_keyword_group_presence(favorite_subject, keyword_group):
    for keyword in keyword_group:
        # Check if the keyword is present in the description
        if keyword in favorite_subject:
            return True
    return False

In [None]:
# List of keyword groups
keywords = {
    'maths': ['maths', 'mathematics', 'combined', 'math', 'pure'],
    'science': ['science', 'bio'],
    'ict': ['ict', 'computer', 'information', 'it'],
    'commerce': ['commerce', 'economics', 'accounting', 'econ', 'business', 'accounts', 'economy', 'buisness', 'account'],
    'english': ['english'],
    'history': ['history'],
    'geography': ['geography'],
    'french': ['french'],
    'agri': ['agri'],
    'dancing': ['dancing'],
    'literature': ['literature'],
    'sinhala': ['sinhala'],
    'art': ['art'],
    'japanese': ['japanese'],
    'music': ['music'],
    'tamil': ['tamil'],
    'german': ['german'],
    'technology': ['technology']
}

In [171]:
# Create a column for each keyword group to mark its presence
for feature_name, keyword_group in keywords.items():
    # The lambda function checks if any of the keywords are present in the description
    data[feature_name] = data['Favorite Subject'].apply(
    lambda desc: check_keyword_group_presence(str(desc), keyword_group)
    )

In [172]:
data.head()

Unnamed: 0,Stream,Did everything go well with your A/L exams?,Favorite Subject,Maths,Science,Religion,English,Sinhala or Tamil,History,Basket I,...,agri,dancing,literature,sinhala,art,japanese,music,tamil,german,technology
0,Physical Science,"Mostly, but I faced some challenges",maths,A,A,A,A,B,A,A,...,False,False,False,False,False,False,False,False,False,False
1,Physical Science,"Yes, everything went well",maths,A,A,A,A,A,A,A,...,False,False,False,False,False,False,False,False,False,False
2,Physical Science,"Mostly, but I faced some challenges",mathematics,A,B,A,A,A,A,A,...,False,False,False,False,False,False,False,False,False,False
3,Physical Science,"Mostly, but I faced some challenges",ict,B,C,C,A,B,A,A,...,False,False,False,False,False,False,False,False,False,False
4,Physical Science,"Mostly, but I faced some challenges",science,A,A,A,A,A,A,B,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Function to get the first matched keyword category
def get_extracted_subject(favorite_subject):
    for feature_name, keyword_group in keywords.items():
        if check_keyword_group_presence(favorite_subject, keyword_group):
            return feature_name  # Return the matched subject category
    return "unknown"  # If no match, return "unknown"

# Apply function to extract subject category
data['Extracted Subject'] = data['Favorite Subject'].astype(str).apply(get_extracted_subject)

# Drop the original 'Favorite Subject' column and keyword presence columns
data.drop(columns=['Favorite Subject'] + list(keywords.keys()), inplace=True)

# Rename the column if needed
data.rename(columns={'Extracted Subject': 'Favorite Subject'}, inplace=True)

In [174]:
data.head()

Unnamed: 0,Stream,Did everything go well with your A/L exams?,Maths,Science,Religion,English,Sinhala or Tamil,History,Basket I,Basket II,Basket III,Career,Favorite Subject
0,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,B,A,A,C,A,software engineering,maths
1,Physical Science,"Yes, everything went well",A,A,A,A,A,A,A,B,A,software engineer,maths
2,Physical Science,"Mostly, but I faced some challenges",A,B,A,A,A,A,A,A,B,engineering,maths
3,Physical Science,"Mostly, but I faced some challenges",B,C,C,A,B,A,A,A,C,software engineer,ict
4,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,A,A,B,S,A,it industry,science


In [175]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
import re

# Function to preprocess text (without breaking into words)
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove stopwords from full phrases
    filtered_text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return filtered_text.strip()

# Apply preprocessing to each career description
processed_careers = data['Career'].apply(preprocess_text)

# Count the frequency of each unique career (keeping full phrases)
career_counts = Counter(processed_careers)

# Get the most common career names
most_common_careers = career_counts.most_common(6500)

most_common_careers  # This will return tuples like ('software engineering', 150), ('fashion design', 120)

[('engineering', 57),
 ('software engineering', 30),
 ('software engineer', 27),
 ('', 24),
 ('teaching', 16),
 ('business', 15),
 ('engineer', 10),
 ('banking', 8),
 ('business analyst', 6),
 ('accountant', 6),
 ('doctor', 5),
 ('lecturer', 5),
 ('banker', 4),
 ('medicine', 4),
 ('aviation', 4),
 ('data science', 4),
 ('finance', 4),
 ('software development', 4),
 ('industry', 3),
 ('data scientist', 3),
 ('food scientist', 3),
 ('maritime', 3),
 ('marketing', 3),
 ('software developer', 3),
 ('entrepreneur', 3),
 ('fashion designer', 3),
 ('information technology', 3),
 ('computer science', 3),
 ('electronic engineering', 3),
 ('data analysis', 3),
 ('data analyst', 3),
 ('chemist', 3),
 ('mechanical engineer', 3),
 ('fashion design', 2),
 ('lecturing', 2),
 ('web development', 2),
 ('buisness', 2),
 ('manager', 2),
 ('robotics', 2),
 ('computer engineering', 2),
 ('se', 2),
 ('sure', 2),
 ('project manager', 2),
 ('s e', 2),
 ('lawyer', 2),
 ('scientist', 2),
 ('civil engineering', 

In [176]:
# Loop through the most common career names and print them
for career, count in most_common_careers:
    print(career, count)

engineering 57
software engineering 30
software engineer 27
 24
teaching 16
business 15
engineer 10
banking 8
business analyst 6
accountant 6
doctor 5
lecturer 5
banker 4
medicine 4
aviation 4
data science 4
finance 4
software development 4
industry 3
data scientist 3
food scientist 3
maritime 3
marketing 3
software developer 3
entrepreneur 3
fashion designer 3
information technology 3
computer science 3
electronic engineering 3
data analysis 3
data analyst 3
chemist 3
mechanical engineer 3
fashion design 2
lecturing 2
web development 2
buisness 2
manager 2
robotics 2
computer engineering 2
se 2
sure 2
project manager 2
s e 2
lawyer 2
scientist 2
civil engineering 2
data engineering 2
cloud engineer 2
project management 2
hotel industry 2
management 2
business management 2
medical field 2
developer 2
software engineer related 1
networking 1
designing 1
business analysis 1
announcer 1
healthcare 1
artificial intelligence 1
food science field 1
sportsman cricketer 1
business manager 1
ev

In [177]:
def check_keyword_group_presence(career, keyword_group):
    """
    Check if any keyword from the keyword group is present in the career string.
    The check is case-insensitive.
    """
    career = career.lower()  # Convert career description to lowercase
    for keyword in keyword_group:
        if keyword.lower() in career:  # Convert keyword to lowercase for case-insensitive matching
            return True
    return False

In [178]:
# List of keyword groups
keywords = {
    'IT/ Software Developer': ['it', 'computer', 's.e.', 'computing', 'ict', 'web', 'programmer', 'software', 'software engineering', 'software engineer', 'data science', 'id develop engineer', 'network enjineer', 'software development', 'data scientist', 'software developer', 'information technology', 'computer science', 'data analysis', 'web development', 'computer engineering', 'se', 's e', 'data engineering', 'data analyst', 'cloud engineer', 'developer', 'software engineer related', 'networking', 'artificial intelligence', 'cyber security analyst', 'game development software development', 'it industry', 'it technology side', 'full stack developing'],
    'Engineer': ['engineering', 'civil engineering', 'electronic engineering', 'mechanical engineer', 'robotics', 'engineer', 'to be a engineer', 'construction'],
    'Designer': ['fashion designer', 'uiux development', 'designing', 'photographergraphic designer', 'fashion design', 'software ui designer', 'ui designing', 'software engineering grapic design', 'uiux designer', 'fashion design or art related career', 'fashion designer'],
    'Lecturer/ Professor': ['maths', 'academia', 'a professor', 'a lecturer or an entrepreneur', 'lecturer', 'lecture', 'lecture engineer', 'university lecture', 'lecturing'],
    'Teacher': ['educator', 'english', 'language specialist', 'teacher', 'pre school teacher', 'teaching', 'teaching tutoring', 'teaching or hygiene officer'],
    'Businessman': ['business', 'ceo', 'buisness', 'businessman', 'my business', 'a businessmen', 'starting my own business', 'having my own business', 'owner of a business', 'entrepreneur', 'enterpreneur'],
    'Business Analysis/ Business Manager/ BIS': ['management', 'management side', 'business analysis', 'business analyst', 'business manager', 'business information systems', 'business management'],
    'Accountant': ['accountant', 'accountancy side', 'accounting', 'acconting'],
    'Medical Industry': ['bio', 'technical', 'doctor', 'doctor medicine', 'medical doctor', 'medical', 'clinician in the medical field', 'medical industry', 'dental surgeon', 'hospitality industry', 'medicine', 'healthcare', 'nursing'],
    'Chemist': ['chemist', 'chemical field'],
    'Banker': ['to be banker', 'banking', 'banker', 'banking side'],
    'Aviation': ['aviation', 'food qualityhygiene control business management and aviation', 'aviation field', 'pilot'],
    'Finance': ['finance', 'financial', 'finance sector', 'financial analyst'],
    'Manager': ['manager', 'managing', 'program manager', 'project management', 'project manager', 'quality or laboratory manager', 'as a manager', 'project manager', 'sports manager'],
    'Lawyer': ['lawyer', 'law'],
    'Hotel Industry': ['hotel industry', 'hotel field', 'hotelier'],
    'Food Scientist': ['food scientist', 'scientist', 'in food science field', 'food technologists', 'food science', 'food industry', 'food technologist', 'food production', 'food quality'],
    'Sportsman': ['being a sportsman cricketer', 'sports'],
    'Tourism': ['tourism', 'tourism and hospitality', 'travel guide'], 
    'Agriculture Industry': ['agriculture', 'agronomist', 'agricultural economist'],
    'Laboratary Side': ['laboratary side', 'laboratory scientists', 'in a laboratory'],
    'Microbiologist' : ['microbiology', 'microbiologist'],
    'HR': ['human resources', 'hr side'],
    'Music Industry': ['music industry'],
    'Photographer': ['photographer'],
    'Other': ['actuary', 'dancing', 'motor mechanic', 'machinist', 'electrician', 'farming', 'sl army', 'tea taster', 'physiologist', 'dairy industry', 'marine biologist', 'supplychain', 'idk', 'non', 'heavy machinery operator', 'ba', 'talented professional', 'i want to be a film director', 'not sfessified', 'automobile', 'surveying', 'air traffic controller', 'yes', 'science', 'industrial', 'event planning', 'marketing', 'academic', 'no idea', 'announcer', 'archaeology', 'statistician', 'judge', 'logistics', 'supply chain management', 'custom officer', 'forex trading', 'not sure', 'huu', 'an author or a librarian', ' ', '']
}

In [179]:
# Create a column for each keyword group to mark its presence
for feature_name, keyword_group in keywords.items():
    # The lambda function checks if any of the keywords are present in the description
    data[feature_name] = data['Career'].apply(
    lambda desc: check_keyword_group_presence(str(desc), keyword_group)
    )

In [180]:
data.head()

Unnamed: 0,Stream,Did everything go well with your A/L exams?,Maths,Science,Religion,English,Sinhala or Tamil,History,Basket I,Basket II,...,Food Scientist,Sportsman,Tourism,Agriculture Industry,Laboratary Side,Microbiologist,HR,Music Industry,Photographer,Other
0,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,B,A,A,C,...,False,False,False,False,False,False,False,False,False,True
1,Physical Science,"Yes, everything went well",A,A,A,A,A,A,A,B,...,False,False,False,False,False,False,False,False,False,True
2,Physical Science,"Mostly, but I faced some challenges",A,B,A,A,A,A,A,A,...,False,False,False,False,False,False,False,False,False,True
3,Physical Science,"Mostly, but I faced some challenges",B,C,C,A,B,A,A,A,...,False,False,False,False,False,False,False,False,False,True
4,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,A,A,B,S,...,False,False,False,False,False,False,False,False,False,True


In [181]:
def get_extracted_career(career):
    for feature_name, keyword_group in keywords.items():
        if check_keyword_group_presence(career, keyword_group):
            return feature_name  # Return the matched category
    return career  # If no match, keep the original value

# Apply function to extract subject category while preserving unmatched values
data['Extracted Career'] = data['Career'].astype(str).apply(get_extracted_career)

# Drop the original 'Career' column and keyword presence columns
data.drop(columns=['Career'] + list(keywords.keys()), inplace=True)

# Rename the column if needed
data.rename(columns={'Extracted Career': 'Career'}, inplace=True)

In [182]:
data.head()

Unnamed: 0,Stream,Did everything go well with your A/L exams?,Maths,Science,Religion,English,Sinhala or Tamil,History,Basket I,Basket II,Basket III,Favorite Subject,Career
0,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,B,A,A,C,A,maths,IT/ Software Developer
1,Physical Science,"Yes, everything went well",A,A,A,A,A,A,A,B,A,maths,IT/ Software Developer
2,Physical Science,"Mostly, but I faced some challenges",A,B,A,A,A,A,A,A,B,maths,Engineer
3,Physical Science,"Mostly, but I faced some challenges",B,C,C,A,B,A,A,A,C,ict,IT/ Software Developer
4,Physical Science,"Mostly, but I faced some challenges",A,A,A,A,A,A,B,S,A,science,IT/ Software Developer


In [207]:
# Save the updated dataset as a CSV file
file_path = "updated_dataset.csv"
data.to_csv(file_path, index=False)

# Provide the download link
file_path

'updated_dataset.csv'