In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KomPhone\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
df = pd.read_csv("datasets/glassdoor_jobs.csv")

# Convert date column to year
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])
df['date'] = df['date'].dt.year

# Clean rating column: ensure ratings are numeric and drop rows with invalid ratings
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df = df.dropna(subset=['rating'])


  df = pd.read_csv("datasets/glassdoor_jobs.csv")


In [4]:
df.head(5)

Unnamed: 0,rating,title,status,pros,cons,advice,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index
0,5.0,Good,"Current Employee, more than 10 years",Knowledge gain of complete project,Financial growth and personal growth,,v,o,v,3.0,3.0,3.0,3.0,3.0,3.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2022,Manager Design,
1,4.0,Good,"Former Employee, less than 1 year","Good work,good work , flexible, support","Good,work, flexible,good support, good team work",,v,o,o,4.0,4.0,4.0,4.0,4.0,4.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2022,Anonymous Employee,
2,4.0,"Supervising the manufacturing the processes, e...","Current Employee, more than 1 year",This company is a best opportunity for me to l...,"Monthly Target work,Maintain production schedu...",,v,o,v,2.0,3.0,2.0,2.0,2.0,2.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2021,Production Engineer,
3,1.0,terrible,"Current Employee, more than 1 year",I wish there were some to list,too many to list here,,x,x,x,1.0,3.0,1.0,3.0,1.0,,https://www.glassdoor.com/Reviews/Calgary-Flam...,2020,Senior Account Executive,
4,4.0,"It could be so good, but it isn’t","Current Employee, more than 3 years",Fast Paced. Endless challenges. Inclusive envi...,The biggest perk of the job provides no value ...,,o,o,o,3.0,3.0,3.0,1.0,4.0,5.0,https://www.glassdoor.com/Reviews/Calgary-Flam...,2023,Assistant Manager,


In [5]:
df['review_text'] = (
    df['title'].fillna('') + ' ' +
    df['pros'].fillna('') + ' ' +
    df['cons'].fillna('')
)

In [6]:
df.head(5)

Unnamed: 0,rating,title,status,pros,cons,advice,Recommend,CEO Approval,Business Outlook,Career Opportunities,Compensation and Benefits,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index,review_text
0,5.0,Good,"Current Employee, more than 10 years",Knowledge gain of complete project,Financial growth and personal growth,,v,o,v,3.0,3.0,3.0,3.0,3.0,3.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2022,Manager Design,,Good Knowledge gain of complete project Finan...
1,4.0,Good,"Former Employee, less than 1 year","Good work,good work , flexible, support","Good,work, flexible,good support, good team work",,v,o,o,4.0,4.0,4.0,4.0,4.0,4.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2022,Anonymous Employee,,"Good Good work,good work , flexible, support G..."
2,4.0,"Supervising the manufacturing the processes, e...","Current Employee, more than 1 year",This company is a best opportunity for me to l...,"Monthly Target work,Maintain production schedu...",,v,o,v,2.0,3.0,2.0,2.0,2.0,2.0,Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm,2021,Production Engineer,,"Supervising the manufacturing the processes, e..."
3,1.0,terrible,"Current Employee, more than 1 year",I wish there were some to list,too many to list here,,x,x,x,1.0,3.0,1.0,3.0,1.0,,https://www.glassdoor.com/Reviews/Calgary-Flam...,2020,Senior Account Executive,,terrible I wish there were some to list too ma...
4,4.0,"It could be so good, but it isn’t","Current Employee, more than 3 years",Fast Paced. Endless challenges. Inclusive envi...,The biggest perk of the job provides no value ...,,o,o,o,3.0,3.0,3.0,1.0,4.0,5.0,https://www.glassdoor.com/Reviews/Calgary-Flam...,2023,Assistant Manager,,"It could be so good, but it isn’t Fast Paced. ..."


In [7]:
JOB_MAPPINGS = {
    'software engineer': [
        'software developer', 'software engineer', 'devops engineer', 'backend developer',
        'front-end developer', 'full-stack developer', 'application developer',
        'system software engineer', 'mobile developer', 'embedded systems engineer',
        'game developer', 'tools developer'
    ],
    'data scientist': [
        'data scientist', 'data engineer', 'machine learning engineer', 'data analyst',
        'data architect', 'business intelligence analyst', 'big data engineer',
        'statistician', 'research scientist', 'predictive modeler'
    ],
    'cybersecurity specialist': [
        'cybersecurity analyst', 'cybersecurity engineer', 'information security analyst',
        'network security engineer', 'security software engineer', 'penetration tester',
        'ethical hacker', 'security consultant', 'security operations center analyst',
        'incident responder', 'threat hunter', 'forensic analyst'
    ],
    'it support': [
        'it support specialist', 'helpdesk technician', 'technical support engineer',
        'desktop support technician', 'service desk analyst', 'system support specialist'
    ],
    'network engineer': [
        'network administrator', 'network engineer', 'system administrator',
        'network operations specialist', 'wireless engineer', 'telecommunications engineer',
        'voip engineer'
    ],
    'cloud engineer': [
        'cloud engineer', 'cloud architect', 'cloud consultant', 'cloud systems engineer',
        'site reliability engineer', 'devops cloud engineer', 'cloud migration specialist'
    ],
    'qa engineer': [
        'qa engineer', 'quality assurance engineer', 'software tester',
        'automation tester', 'manual tester', 'test analyst', 'performance tester'
    ],
    'ux designer': [
        'ui designer', 'ux designer', 'web developer', 'ui/ux designer', 'interaction designer',
        'product designer', 'visual designer', 'graphic designer'
    ],
    'it manager': [
        'it manager', 'it project manager', 'technical manager', 'program manager',
        'product manager', 'technology operations manager', 'technical program manager'
    ],
    'ai engineer': [
        'ai engineer', 'artificial intelligence engineer', 'machine learning engineer', "ai research scientist", "ai software engineer", "computer vision engineer", "natural language processing Engineer", "ai ethicist"
    ],
    'database administrator': [
        'database administrator', 'dba', 'sql administrator', 'database engineer', 'oracle administrator'
    ],
    'software architect': [
        'software architect', 'solution architect', 'technical architect'
    ],
}

In [8]:
def categorize_job(row):
    title = str(row['title']).lower()
    job_desc = str(row.get('job', '')).lower()  # in case the 'job' column exists
    for category, keywords in JOB_MAPPINGS.items():
        # Create regex pattern with word boundaries for exact matching
        pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, keywords)))
        if re.search(pattern, title) or re.search(pattern, job_desc):
            return category
    return 'Other'

In [9]:
df['job_category'] = df.apply(categorize_job, axis=1)
it_jobs = df[df['job_category'] != 'Other']
print(it_jobs.head(10))


it_jobs = it_jobs[it_jobs['review_text'].str.strip().astype(bool)]

     rating                                             title  \
51      5.0  Be ready to work hard, but really learn and grow   
68      4.0                             Freelancing is for me   
147     2.0                 Not all change is for the better.   
200     4.0                                       Good people   
243     1.0                                   DON'T WORK HERE   
282     4.0                    Overall a very good experience   
289     5.0                                      Good company   
299     4.0                                     Great Company   
326     5.0                             Nice work environment   
342     4.0                               Positive experience   

                                  status  \
51    Former Employee, more than 3 years   
68                       Former Employee   
147                     Current Employee   
200   Former Employee, more than 5 years   
243  Current Employee, more than 3 years   
282    Former Employ

In [10]:
it_jobs

Unnamed: 0,rating,title,status,pros,cons,advice,Recommend,CEO Approval,Business Outlook,Career Opportunities,...,Senior Management,Work/Life Balance,Culture & Values,Diversity & Inclusion,firm_link,date,job,index,review_text,job_category
51,5.0,"Be ready to work hard, but really learn and grow","Former Employee, more than 3 years",This is a company where you are truly challeng...,"As with any startup, you'll be juggling a lot ...",,v,o,v,5.0,...,4.0,4.0,5.0,4.0,https://www.glassdoor.com/Reviews/I-Will-Teach...,2021,Senior Product Designer,,"Be ready to work hard, but really learn and gr...",ux designer
68,4.0,Freelancing is for me,Former Employee,i want to earn money,finding of client that pays,,o,o,o,,...,,,,,https://www.glassdoor.com/Reviews/I-Will-Teach...,2022,Freelance Graphic Designer,,Freelancing is for me i want to earn money fin...,ux designer
147,2.0,Not all change is for the better.,Current Employee,Flexible for parents with children. Pay is goo...,Exec Mgmt shifted to younger group and they're...,,x,r,o,1,...,2.0,5.0,,,https://www.glassdoor.com/Reviews/California-C...,2011,Marketing Program Manager,,Not all change is for the better. Flexible for...,it manager
200,4.0,Good people,"Former Employee, more than 5 years",Great people and schedule. Get to go new places,"Management changes and so do objectives, not a...",,v,v,r,3,...,3.0,4.0,4.0,,https://www.glassdoor.com/Reviews/California-C...,2013,Marketing Program Manager,,Good people Great people and schedule. Get to...,it manager
243,1.0,DON'T WORK HERE,"Current Employee, more than 3 years","Low pay, poor treatment, hostility abounds.",Brutal management and an backward IT group. N...,,x,r,x,2,...,2,2,1.0,,https://www.glassdoor.com/Reviews/Canada-Life-...,2019,Senior Software Engineer,,"DON'T WORK HERE Low pay, poor treatment, hosti...",software engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9901498,2.0,Senior Program Manager,"Former Employee, more than 1 year",Great benefits and good people.,Lack of direction from upper management.,,o,o,o,3,...,2,5,5.0,,https://www.glassdoor.com/Reviews/CDM-Smith-Re...,2016,Senior Program Manager,,Senior Program Manager Great benefits and good...,it manager
9901579,4.0,Good company to work with,"Current Employee, more than 8 years",Good company to work with,Only night shifts & no bonus or proper hikes,,o,o,o,,...,,,,,https://www.glassdoor.com/Reviews/CDM-Smith-Re...,2021,IT Service Desk Analyst,,Good company to work with Good company to work...,it support
9901584,4.0,Good,Current Employee,Working in CDM SMITH is Good,"No good salaries,No good salaries",,o,o,o,,...,,,,,https://www.glassdoor.com/Reviews/CDM-Smith-Re...,2021,Software Engineer,,Good Working in CDM SMITH is Good No good sala...,software engineer
9901677,4.0,Great place to work,"Former Employee, more than 1 year",The Zippo manufacturing company is a great pla...,"The location of Bradford, PA is a little out o...",,o,o,o,,...,,,,,https://www.glassdoor.com/Reviews/Zippo-Review...,2021,Graphic Designer,,Great place to work The Zippo manufacturing co...,ux designer


In [11]:
sia = SentimentIntensityAnalyzer()

In [12]:
def get_sentiment_combined(rating, text, weight_rating=0.5, weight_sia=0.5):
    """
    For reviews with rating 1 or 5, use those directly:
      - 1 => Negative
      - 5 => Positive
    For ratings 2, 3, and 4, combine the numeric rating and VADER's compound score.
    
    Process for ratings 2,3,4:
      1. Normalize the rating: map 2 -> 0, 3 -> 0.5, 4 -> 1.
      2. Compute VADER compound score and normalize to [0,1] via: normalized_sia = (compound + 1)/2.
      3. Compute a weighted average of the normalized rating and normalized VADER score.
      4. Assign label based on combined score:
         - If combined_score < 0.33: "Negative"
         - If combined_score > 0.66: "Positive"
         - Otherwise: "Neutral"
    """
    if rating == 1:
        return "Negative"
    elif rating == 5:
        return "Positive"
    else:
        # Normalize the rating from 2 to 4 into [0, 1]
        normalized_rating = (rating - 2) / 2.0  # 2->0, 3->0.5, 4->1
        # Get VADER compound score and normalize it to [0,1]
        compound = sia.polarity_scores(text)['compound']
        normalized_sia = (compound + 1) / 2.0
        # Compute weighted average of the two signals
        combined_score = weight_rating * normalized_rating + weight_sia * normalized_sia
        
        # Assign sentiment based on thresholds (adjust these thresholds as needed)
        if combined_score < 0.33:
            return "Negative"
        elif combined_score > 0.66:
            return "Positive"
        else:
            return "Neutral"

In [13]:
it_jobs['sentiment'] = it_jobs.apply(lambda row: get_sentiment_combined(row['rating'], row['review_text']), axis=1)

In [14]:
# 6. Finalizing the Dataset
final_columns = ['rating', 'review_text', 'job_category', 'sentiment', "date"]
it_jobs = it_jobs[final_columns]

In [15]:
print(f"\nData Validation:")
print(f"Total IT job reviews: {len(it_jobs)}")
print("Sentiment distribution:")
print(it_jobs['sentiment'].value_counts(normalize=True))
print("\nJob category distribution:")
print(it_jobs['job_category'].value_counts().head(10))


Data Validation:
Total IT job reviews: 612102
Sentiment distribution:
sentiment
Positive    0.759128
Neutral     0.139705
Negative    0.101166
Name: proportion, dtype: float64

Job category distribution:
job_category
software engineer           365582
it manager                   85782
data scientist               59485
ux designer                  20264
qa engineer                  19711
it support                   18149
network engineer             15627
software architect           11007
database administrator        7120
cybersecurity specialist      4864
Name: count, dtype: int64


In [16]:
it_jobs

Unnamed: 0,rating,review_text,job_category,sentiment,date
51,5.0,"Be ready to work hard, but really learn and gr...",ux designer,Positive,2021
68,4.0,Freelancing is for me i want to earn money fin...,ux designer,Positive,2022
147,2.0,Not all change is for the better. Flexible for...,it manager,Neutral,2011
200,4.0,Good people Great people and schedule. Get to...,it manager,Positive,2013
243,1.0,"DON'T WORK HERE Low pay, poor treatment, hosti...",software engineer,Negative,2019
...,...,...,...,...,...
9901498,2.0,Senior Program Manager Great benefits and good...,it manager,Neutral,2016
9901579,4.0,Good company to work with Good company to work...,it support,Positive,2021
9901584,4.0,Good Working in CDM SMITH is Good No good sala...,software engineer,Positive,2021
9901677,4.0,Great place to work The Zippo manufacturing co...,ux designer,Positive,2021


In [17]:
for category in JOB_MAPPINGS:
    it_jobs.to_csv('all_it_jobs3.csv', index=False)

In [18]:
for category in JOB_MAPPINGS:    
    category_df = it_jobs[it_jobs['job_category'] == category]
    category_df.to_csv(f'{category.lower().replace(" ", "_")}_jobs.csv', index=False)