In [1]:
import json
import os
import re
import string

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the saved DistilBERT weights, config and vocabulary.
model_path = "distil_bert_model_data"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The tokenizer and the classifier are both restored from the same directory.
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.to(device)
# Switch to evaluation mode (disables dropout); as the last expression of the
# cell this also displays the module tree.
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# 1. Data Loading and Preprocessing

In [3]:
def load_and_preprocess_data(file_path):
    """Read a review CSV and attach a normalised copy of each review.

    Only the ``review_text``, ``job_category`` and ``date`` columns are kept,
    and any row missing one of them is dropped. A ``cleaned_review`` column is
    then added holding the review lower-cased, with URLs and punctuation
    removed and runs of whitespace collapsed to single spaces.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame with the three retained columns plus ``cleaned_review``.
        The original row index is preserved (it is not reset after dropping).
    """
    required = ['review_text', 'job_category', 'date']
    # Character class matching any single ASCII punctuation character.
    punctuation_class = r'[{}]'.format(re.escape(string.punctuation))

    def clean_text(text):
        lowered = str(text).lower()
        without_urls = re.sub(r'http\S+|www\S+', '', lowered)
        # Punctuation is deleted outright, not replaced by a space.
        without_punct = re.sub(punctuation_class, '', without_urls)
        return re.sub(r'\s+', ' ', without_punct).strip()

    raw = pd.read_csv(file_path)
    df = raw[required].dropna(subset=required)
    df['cleaned_review'] = df['review_text'].apply(clean_text)
    return df

In [4]:
# Input CSV of reviews; the same path (minus its extension) is reused further
# down to name the JSON output file.
file_path = 'glassdoor_datasets/ai_engineer_jobs.csv'
# Keep only review_text / job_category / date and add the cleaned_review column.
df = load_and_preprocess_data(file_path)

In [5]:
# Preview the first rows to compare the raw review text with its cleaned form.
df.head()

Unnamed: 0,review_text,job_category,date,cleaned_review
0,"Cool tech, difficult industry. Interesting tec...",ai engineer,2019,cool tech difficult industry interesting techn...
1,"Cool tech, difficult industry. Interesting tec...",ai engineer,2019,cool tech difficult industry interesting techn...
2,Right place for software developers Good place...,ai engineer,2021,right place for software developers good place...
3,Excellent Good culture Nice people High salary...,ai engineer,2023,excellent good culture nice people high salary...
4,Good Good management system for beginners. No ...,ai engineer,2021,good good management system for beginners no d...


# 2. Execute Sentiment Analysis

In [9]:
# Classify every cleaned review with the loaded DistilBERT model, in batches,
# and store the predicted label in df['predicted_sentiment'].
texts = df['cleaned_review'].tolist()

batch_size = 64    # reviews per forward pass
max_length = 256   # token cap per review; longer inputs are truncated
all_preds = []

for start in tqdm(range(0, len(texts), batch_size), desc="Batch Inference"):
    batch_texts = texts[start:start + batch_size]
    # padding=True pads each batch to its longest member (up to max_length).
    inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # .tolist() yields plain Python ints (not NumPy scalars) regardless of the
    # device the logits live on.
    all_preds.extend(outputs.logits.argmax(dim=-1).tolist())

# Class index -> label name.
# NOTE(review): assumes this order matches the labels used at training time;
# confirm against model.config.id2label.
sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
df['predicted_sentiment'] = [sentiment_map[p] for p in all_preds]

Batch Inference: 100%|██████████| 4/4 [00:01<00:00,  3.32it/s]


# 3. Results By Year

In [11]:
# Summarise predicted sentiment per (date, job_category) pair, keyed by the
# stringified date so the result can be serialised to JSON as-is.
aggregation = {}

grouped = df.groupby(['date', 'job_category'])
for (year, job_cat), reviews in grouped:
    counts = reviews['predicted_sentiment'].value_counts().to_dict()
    summary = {
        "job_category": job_cat,
        # Label with the highest count; on a tie the first key in `counts` wins.
        "majority_sentiment": max(counts, key=counts.get),
        "num_reviews": len(reviews),
        "sentiment_counts": counts,
    }
    aggregation.setdefault(str(year), []).append(summary)

# 4. Save Data to JSON

In [12]:
# Write the summary next to the input CSV, swapping its extension for .json.
base_name, _ext = os.path.splitext(file_path)
output_file = f"{base_name}.json"
with open(output_file, "w") as out_fh:
    json.dump(aggregation, out_fh, indent=4)
print(f"Aggregated data saved to {output_file}")

# Echo the same structure in the notebook for a quick visual check.
print(json.dumps(aggregation, indent=4))

Aggregated data saved to glassdoor_datasets/ai_engineer_jobs.json
{
    "2008": [
        {
            "job_category": "ai engineer",
            "majority_sentiment": "Positive",
            "num_reviews": 2,
            "sentiment_counts": {
                "Positive": 1,
                "Neutral": 1
            }
        }
    ],
    "2016": [
        {
            "job_category": "ai engineer",
            "majority_sentiment": "Positive",
            "num_reviews": 2,
            "sentiment_counts": {
                "Positive": 2
            }
        }
    ],
    "2017": [
        {
            "job_category": "ai engineer",
            "majority_sentiment": "Positive",
            "num_reviews": 2,
            "sentiment_counts": {
                "Positive": 2
            }
        }
    ],
    "2018": [
        {
            "job_category": "ai engineer",
            "majority_sentiment": "Positive",
            "num_reviews": 7,
            "sentiment_counts": {
           