<a href="https://colab.research.google.com/github/syed-mohsin-s/Employee-Sentiment-Analysis/blob/main/Employee_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load datasets
train = pd.read_csv("train_set.csv")
val = pd.read_csv("validation_set.csv")
test = pd.read_csv("test_set.csv")

print("Train shape:", train.shape)
print("Validation shape:", val.shape)
print("Test shape:", test.shape)

print(train.head())
print(train['label'].value_counts())


Train shape: (656, 13)
Validation shape: (116, 13)
Test shape: (225, 10)
      id     person_name                                  nine_box_category  \
0      1        John Doe  Category 1: 'Risk' (Low performance, Low poten...   
1  10045   Douglas Henry  Category 1: 'Risk' (Low performance, Low poten...   
2  10044   Douglas Henry  Category 1: 'Risk' (Low performance, Low poten...   
3  10005  Freddie Davies  Category 1: 'Risk' (Low performance, Low poten...   
4  10004  Freddie Davies  Category 1: 'Risk' (Low performance, Low poten...   

                                            feedback  adjusted  reviewed  \
0  John has not progressed in his position. He is...     False      True   
1  Douglas Henry has been having trouble in all a...     False     False   
2  Douglas has a lot to work on and areas to grow...     False     False   
3  Freddie is a nice guy, but his performance and...     False      True   
4  Freddie has been quite disappointing this quar...     False     False

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load training data
train = pd.read_csv("train_set.csv")
val = pd.read_csv("validation_set.csv")
test = pd.read_csv("test_set.csv")

# Map numeric labels to text if needed
# Update label_map to include all unique labels from the training data
unique_labels = train['label'].unique()
label_map = {label: f"Category_{label}" for label in unique_labels}
train['sentiment'] = train['label'].map(label_map)
val['sentiment'] = val['label'].map(label_map)

# Use feedback_clean if available
X_train = train['feedback_clean'].fillna(train['feedback'])
X_val = val['feedback_clean'].fillna(val['feedback'])
y_train = train['sentiment']
y_val = val['sentiment']

# TF-IDF feature extraction
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Validate
y_pred = model.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.46551724137931033
              precision    recall  f1-score   support

  Category_0       0.52      0.76      0.62        17
  Category_1       0.44      0.31      0.36        13
  Category_2       0.50      0.33      0.40        12
  Category_3       0.48      0.71      0.57        14
  Category_4       0.37      0.44      0.40        16
  Category_5       0.60      0.25      0.35        12
  Category_6       0.00      0.00      0.00         5
  Category_7       0.50      0.18      0.27        11
  Category_8       0.44      0.69      0.54        16

    accuracy                           0.47       116
   macro avg       0.43      0.41      0.39       116
weighted avg       0.46      0.47      0.43       116



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
X_test = test['feedback_clean'].fillna(test['feedback'])
X_test_tfidf = vectorizer.transform(X_test)
test['predicted_sentiment'] = model.predict(X_test_tfidf)

# Save for next tasks
test[['id', 'person_name', 'feedback', 'predicted_sentiment']].to_csv("test_set_with_sentiment.csv", index=False)


In [9]:
# Task 1 – Sentiment Labeling with a Transformer (DistilBERT)
# -----------------------------------------------------------
# This script trains a DistilBERT sequence classification model on your train/validation sets
# and predicts sentiment for the test set. It is robust to both numeric and string labels
# (e.g., 0/1/2 or "Negative"/"Neutral"/"Positive").
#
# Expected columns in train/validation:
#   id, person_name, nine_box_category, feedback, adjusted, reviewed, label,
#   feedback_len, num_of_sent, performance_class, potential_class,
#   feedback_clean, data_type
# Expected columns in test:
#   same as above, but label may be absent.
#
# Outputs:
#   - ./artifacts/
#       ├── model/                          (saved best model + tokenizer)
#       ├── metrics_validation.json         (eval metrics)
#       ├── classification_report.txt       (per-class metrics on validation)
#       ├── label_mapping.json              (id2label/label2id mapping)
#       └── test_set_with_sentiment.csv     (predictions for test)
#
# Usage:
#   - Put this file next to your CSVs, or adjust the file paths below.
#   - Run in a notebook cell or as a script: `python task1_transformer_sentiment.py`
#
# Notes:
#   - If you have a GPU available, training will be much faster. The script auto-detects CUDA.
#   - You can switch to RoBERTa by changing MODEL_NAME to "roberta-base" and tokenizer/model classes accordingly.

import os
import json
import math
import random
from typing import List, Dict

import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# -----------------------------
# Configuration
# -----------------------------
TRAIN_PATH = "train_set.csv"
VAL_PATH   = "validation_set.csv"
TEST_PATH  = "test_set.csv"

TEXT_COL_PRIMARY = "feedback_clean"  # preferred clean text
TEXT_COL_FALLBACK = "feedback"       # fallback if clean text missing/empty
LABEL_COL = "label"
ID_COL = "id"  # used when saving predictions
PERSON_COL = "person_name"  # optional for nicer output

ARTIFACT_DIR = "artifacts"
MODEL_OUT_DIR = os.path.join(ARTIFACT_DIR, "model")
METRICS_PATH = os.path.join(ARTIFACT_DIR, "metrics_validation.json")
REPORT_PATH = os.path.join(ARTIFACT_DIR, "classification_report.txt")
LABELMAP_PATH = os.path.join(ARTIFACT_DIR, "label_mapping.json")
PRED_OUT_PATH = os.path.join(ARTIFACT_DIR, "test_set_with_sentiment.csv")

# Model choice
MODEL_NAME = "distilbert-base-uncased"  # swap to "roberta-base" for RoBERTa
MAX_LENGTH = 160
N_EPOCHS = 3
LR = 2e-5
TRAIN_BS = 16
EVAL_BS = 16
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
SEED = 42

os.makedirs(ARTIFACT_DIR, exist_ok=True)
set_seed(SEED)

# -----------------------------
# Helpers
# -----------------------------

def coalesce_text(df: pd.DataFrame, primary: str, fallback: str) -> pd.Series:
    """Return a text series preferring primary, falling back to fallback, and filling NaNs."""
    prim = df[primary] if primary in df.columns else pd.Series([None]*len(df))
    fall = df[fallback] if fallback in df.columns else pd.Series([None]*len(df))
    # Choose primary if non-empty string after strip, else fallback
    out = []
    for a, b in zip(prim.fillna(""), fall.fillna("")):
        at = str(a).strip()
        bt = str(b).strip()
        out.append(at if len(at) > 0 else bt)
    return pd.Series(out)


def normalize_label_series(s: pd.Series) -> (List[int], Dict[int, str], Dict[str, int]):
    """Convert a label series that may be numeric or string into ids 0..K-1.
    Returns: (label_ids_list, id2label, label2id)
    - If labels are numeric but not contiguous from 0, we remap to 0..K-1.
    - If labels are strings, we sort label names for stable mapping.
    """
    if s.dtype.kind in {"i", "u"}:
        # numeric labels
        uniq = sorted(pd.Series(s.unique()).dropna().tolist())
        label2id = {str(lbl): i for i, lbl in enumerate(uniq)}
        id2label = {i: str(lbl) for i, lbl in enumerate(uniq)}
        mapped = [label2id[str(x)] for x in s]
        return mapped, id2label, label2id
    else:
        # string labels
        uniq = sorted(pd.Series(s.astype(str).unique()).dropna().tolist())
        label2id = {lbl: i for i, lbl in enumerate(uniq)}
        id2label = {i: lbl for lbl, i in label2id.items()}
        mapped = [label2id[str(x)] for x in s.astype(str)]
        return mapped, id2label, label2id


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if labels is None: # Handle cases where labels are not available (e.g., test set prediction)
        return {} # Return empty metrics
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}


# -----------------------------
# Load data
# -----------------------------
print("Loading CSVs...")
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

# Build text series
train_texts = coalesce_text(train_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
val_texts   = coalesce_text(val_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
test_texts  = coalesce_text(test_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)

# Remove rows with truly empty text (rare but possible)
train_mask = train_texts.str.len() > 0
val_mask = val_texts.str.len() > 0

train_df = train_df.loc[train_mask].reset_index(drop=True)
val_df = val_df.loc[val_mask].reset_index(drop=True)
train_texts = train_texts.loc[train_mask].reset_index(drop=True)
val_texts = val_texts.loc[val_mask].reset_index(drop=True)

# Normalize labels -> ids, and build mappings
if LABEL_COL not in train_df.columns:
    raise ValueError(f"'{LABEL_COL}' column not found in training data.")

train_label_ids, id2label, label2id = normalize_label_series(train_df[LABEL_COL])
val_label_ids, _, _ = normalize_label_series(val_df[LABEL_COL])

# Persist mappings for downstream tasks
with open(LABELMAP_PATH, "w") as f:
    json.dump({"id2label": {str(k): v for k, v in id2label.items()},
               "label2id": label2id}, f, indent=2)

num_labels = len(id2label)
print(f"Detected {num_labels} labels: {id2label}")

# -----------------------------
# Tokenization
# -----------------------------
print("Tokenizing...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

train_enc = tokenizer(
    train_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
)
val_enc = tokenizer(
    val_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
)
test_enc = tokenizer(
    test_texts.fillna("").tolist(),
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
)

train_dataset = TextDataset(train_enc, train_label_ids)
val_dataset = TextDataset(val_enc, val_label_ids)
test_dataset = TextDataset(test_enc, labels=None)

# -----------------------------
# Model & Training
# -----------------------------
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={i: id2label[i] for i in range(num_labels)},
    label2id={id2label[i]: i for i in range(num_labels)},
)

use_cuda = torch.cuda.is_available()
print("CUDA available:", use_cuda)

warmup_steps = None  # we will use warmup_ratio instead

training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "hf_runs"),
    eval_strategy="epoch",
    save_strategy="epoch",  # Changed save_strategy to match eval_strategy
    save_total_limit=N_EPOCHS,
    save_steps=math.ceil(len(train_dataset) / TRAIN_BS),
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=use_cuda,  # enable mixed precision if GPU present
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Training...")
trainer.train()

print("Evaluating on validation set...")
val_metrics = trainer.evaluate()
with open(METRICS_PATH, "w") as f:
    json.dump(val_metrics, f, indent=2)
print("Validation metrics saved to:", METRICS_PATH)

# Save a human-readable classification report
val_logits = trainer.predict(val_dataset).predictions
val_preds = np.argmax(val_logits, axis=-1)
val_true = np.array(val_label_ids)
report = classification_report(
    val_true,
    val_preds,
    target_names=[id2label[i] for i in range(num_labels)],
    digits=4,
)
with open(REPORT_PATH, "w") as f:
    f.write(report)
print("Classification report saved to:", REPORT_PATH)

# -----------------------------
# Predict on test and save outputs
# -----------------------------
print("Predicting on test set...")
with torch.no_grad():
    test_out = trainer.predict(test_dataset)
    test_logits = test_out.predictions
    test_pred_ids = np.argmax(test_logits, axis=-1)

# Convert to label names
id2label_str = {i: id2label[i] for i in range(num_labels)}
test_labels = [id2label_str[i] for i in test_pred_ids]

# Confidence (softmax)
def softmax(x):
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

probs = softmax(test_logits)
conf = probs.max(axis=-1)

# Build output DataFrame
out_df = pd.DataFrame({
    ID_COL: test_df[ID_COL] if ID_COL in test_df.columns else np.arange(len(test_df)),
    "person_name": test_df[PERSON_COL] if PERSON_COL in test_df.columns else None,
    "feedback_text": test_texts,
    "predicted_sentiment": test_labels,
    "pred_confidence": conf,
})

out_df.to_csv(PRED_OUT_PATH, index=False)
print("Saved test predictions to:", PRED_OUT_PATH)

# -----------------------------
# Save the best model & tokenizer
# -----------------------------
print("Saving model and tokenizer...")
trainer.save_model(MODEL_OUT_DIR)
tokenizer.save_pretrained(MODEL_OUT_DIR)
print("Model saved to:", MODEL_OUT_DIR)

print("All done. ✨")

Loading CSVs...
Detected 9 labels: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8'}
Tokenizing...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model...
CUDA available: True


  trainer = Trainer(


Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,No log,2.129731,0.241379,0.098384,0.192519,0.120955
2,2.163100,1.996161,0.37069,0.128056,0.291667,0.176755
3,2.010900,1.92451,0.387931,0.175633,0.30754,0.207682


Evaluating on validation set...


Validation metrics saved to: artifacts/metrics_validation.json
Classification report saved to: artifacts/classification_report.txt
Predicting on test set...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved test predictions to: artifacts/test_set_with_sentiment.csv
Saving model and tokenizer...
Model saved to: artifacts/model
All done. ✨


In [None]:
import os
print(os.listdir("."))


['.config', 'employee_review_mturk_dataset_test_v6_kaggle.csv', 'employee_review_mturk_dataset_v10_kaggle.csv', 'train_set.csv', 'test_set.csv', 'validation_set.csv', 'sample_data']


In [None]:
import os
import json
import math
from typing import List, Dict

import numpy as np
import pandas as pd

import torch
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# ==========================================
# Task 1 – Transformer Sentiment (Clean, Weighted)
# ==========================================
# This version avoids modifying model.forward and instead uses a
# custom Trainer (WeightedTrainer) to apply class-weighted loss.
# Also increases max_length/epochs and includes robust label mapping.

# -----------------------------
# Configuration
# -----------------------------
TRAIN_PATH = "train_set.csv"
VAL_PATH   = "validation_set.csv"
TEST_PATH  = "test_set.csv"

TEXT_COL_PRIMARY = "feedback_clean"   # preferred clean text
TEXT_COL_FALLBACK = "feedback"        # fallback if clean text missing/empty
LABEL_COL = "label"
ID_COL = "id"
PERSON_COL = "person_name"

ARTIFACT_DIR = "artifacts"
MODEL_OUT_DIR = os.path.join(ARTIFACT_DIR, "model")
METRICS_PATH = os.path.join(ARTIFACT_DIR, "metrics_validation.json")
REPORT_PATH = os.path.join(ARTIFACT_DIR, "classification_report.txt")
LABELMAP_PATH = os.path.join(ARTIFACT_DIR, "label_mapping.json")
PRED_OUT_PATH = os.path.join(ARTIFACT_DIR, "test_set_with_sentiment.csv")

# Model choice
MODEL_NAME = "distilbert-base-uncased"   # swap to "roberta-base" to try RoBERTa
MAX_LENGTH = 256
N_EPOCHS = 5
LR = 3e-5
TRAIN_BS = 16
EVAL_BS = 16
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
SEED = 42

os.makedirs(ARTIFACT_DIR, exist_ok=True)
set_seed(SEED)

# -----------------------------
# Safety check for common import conflict
# -----------------------------
# Warn if a local file/folder named 'torch' exists
if any(name == "torch.py" or name == "torch" for name in os.listdir(".")):
    print("[WARN] A local file/folder named 'torch' is present in the working directory.\n"
          "       This can shadow the real PyTorch package and cause AttributeError issues.\n"
          "       Rename it and remove __pycache__ if present.")

# -----------------------------
# Helpers
# -----------------------------

def coalesce_text(df: pd.DataFrame, primary: str, fallback: str) -> pd.Series:
    prim = df[primary] if primary in df.columns else pd.Series([None]*len(df))
    fall = df[fallback] if fallback in df.columns else pd.Series([None]*len(df))
    out = []
    for a, b in zip(prim.fillna(""), fall.fillna("")):
        at = str(a).strip()
        bt = str(b).strip()
        out.append(at if len(at) > 0 else bt)
    return pd.Series(out)


def normalize_label_series(s: pd.Series) -> (List[int], Dict[int, str], Dict[str, int]):
    # Convert possibly numeric/string labels into contiguous ids 0..K-1
    if s.dtype.kind in {"i", "u"}:  # integer/unsigned
        uniq = sorted(pd.Series(s.unique()).dropna().tolist())
        label2id = {str(lbl): i for i, lbl in enumerate(uniq)}
        id2label = {i: str(lbl) for i, lbl in enumerate(uniq)}
        mapped = [label2id[str(x)] for x in s]
    else:
        uniq = sorted(pd.Series(s.astype(str).unique()).dropna().tolist())
        label2id = {lbl: i for i, lbl in enumerate(uniq)}
        id2label = {i: lbl for lbl, i in label2id.items()}
        mapped = [label2id[str(x)] for x in s.astype(str)]
    return mapped, id2label, label2id


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}


# -----------------------------
# Load data
# -----------------------------
print("Loading CSVs...")
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

train_texts = coalesce_text(train_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
val_texts   = coalesce_text(val_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
test_texts  = coalesce_text(test_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)

# Filter out truly empty
train_mask = train_texts.str.len() > 0
val_mask = val_texts.str.len() > 0
train_df = train_df.loc[train_mask].reset_index(drop=True)
val_df = val_df.loc[val_mask].reset_index(drop=True)
train_texts = train_texts.loc[train_mask].reset_index(drop=True)
val_texts = val_texts.loc[val_mask].reset_index(drop=True)

if LABEL_COL not in train_df.columns:
    raise ValueError(f"'{LABEL_COL}' column not found in training data.")

train_label_ids, id2label, label2id = normalize_label_series(train_df[LABEL_COL])
val_label_ids, _, _ = normalize_label_series(val_df[LABEL_COL])

with open(LABELMAP_PATH, "w") as f:
    json.dump({"id2label": {str(k): v for k, v in id2label.items()},
               "label2id": label2id}, f, indent=2)

num_labels = len(id2label)
print(f"Detected {num_labels} labels: {id2label}")

# -----------------------------
# Class imbalance handling
# -----------------------------
classes = np.unique(train_label_ids)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_label_ids)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# -----------------------------
# Tokenization
# -----------------------------
print("Tokenizing...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

train_enc = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)
val_enc = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)
test_enc = tokenizer(test_texts.fillna("").tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)

train_dataset = TextDataset(train_enc, train_label_ids)
val_dataset = TextDataset(val_enc, val_label_ids)
test_dataset = TextDataset(test_enc, labels=None)

# -----------------------------
# Model
# -----------------------------
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={i: id2label[i] for i in range(num_labels)},
    label2id={id2label[i]: i for i in range(num_labels)},
)

# -----------------------------
# Custom Trainer with weighted loss
# -----------------------------
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # move weights to the right device lazily (Trainer sets device later)
        self._class_weights_cpu = class_weights
        self._class_weights = None

    @property
    def class_weights(self):
        # ensure weights are on the same device as model
        if self._class_weights is None or self._class_weights.device != self.args.device:
            self._class_weights = self._class_weights_cpu.to(self.args.device)
        return self._class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# -----------------------------
# Training setup
# -----------------------------
use_cuda = torch.cuda.is_available()
print("CUDA available:", use_cuda)

training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "hf_runs"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=use_cuda,
    seed=SEED,
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -----------------------------
# Train & Evaluate
# -----------------------------
print("Training...")
trainer.train()

print("Evaluating on validation set...")
val_metrics = trainer.evaluate()
with open(METRICS_PATH, "w") as f:
    json.dump(val_metrics, f, indent=2)
print("Validation metrics saved to:", METRICS_PATH)

val_logits = trainer.predict(val_dataset).predictions
val_preds = np.argmax(val_logits, axis=-1)
val_true = np.array(val_label_ids)
report = classification_report(val_true, val_preds, target_names=[id2label[i] for i in range(num_labels)], digits=4)
with open(REPORT_PATH, "w") as f:
    f.write(report)
print("Classification report saved to:", REPORT_PATH)

# -----------------------------
# Predict on test and save outputs
# -----------------------------
print("Predicting on test set...")
with torch.no_grad():
    test_out = trainer.predict(test_dataset)
    test_logits = test_out.predictions
    test_pred_ids = np.argmax(test_logits, axis=-1)

id2label_str = {i: id2label[i] for i in range(num_labels)}
test_labels = [id2label_str[i] for i in test_pred_ids]

def softmax(x):
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

probs = softmax(test_logits)
conf = probs.max(axis=-1)

out_df = pd.DataFrame({
    ID_COL: test_df[ID_COL] if ID_COL in test_df.columns else np.arange(len(test_df)),
    "person_name": test_df[PERSON_COL] if PERSON_COL in test_df.columns else None,
    "feedback_text": test_texts,
    "predicted_sentiment": test_labels,
    "pred_confidence": conf,
})

out_df.to_csv(PRED_OUT_PATH, index=False)
print("Saved test predictions to:", PRED_OUT_PATH)

# -----------------------------
# Save model & tokenizer
# -----------------------------
print("Saving model and tokenizer...")
trainer.save_model(MODEL_OUT_DIR)
tokenizer.save_pretrained(MODEL_OUT_DIR)
print("Model saved to:", MODEL_OUT_DIR)

print("All done. ✨")


Loading CSVs...
Detected 9 labels: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8'}
Tokenizing...


In [None]:
import torch
import torch.nn as nn
print(nn)   # should print a module, not error


<module 'torch.nn' from '/usr/local/lib/python3.11/dist-packages/torch/nn/__init__.py'>


In [7]:
import os
import json
import math
from typing import List, Dict

import numpy as np
import pandas as pd

import torch
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# ==========================================
# Task 1 – Transformer Sentiment (Clean, Weighted)
# ==========================================
# This version avoids modifying model.forward and instead uses a
# custom Trainer (WeightedTrainer) to apply class-weighted loss.
# Also increases max_length/epochs and includes robust label mapping.

# -----------------------------
# Configuration
# -----------------------------
TRAIN_PATH = "train_set.csv"
VAL_PATH   = "validation_set.csv"
TEST_PATH  = "test_set.csv"

TEXT_COL_PRIMARY = "feedback_clean"   # preferred clean text
TEXT_COL_FALLBACK = "feedback"        # fallback if clean text missing/empty
LABEL_COL = "label"
ID_COL = "id"
PERSON_COL = "person_name"

ARTIFACT_DIR = "artifacts"
MODEL_OUT_DIR = os.path.join(ARTIFACT_DIR, "model")
METRICS_PATH = os.path.join(ARTIFACT_DIR, "metrics_validation.json")
REPORT_PATH = os.path.join(ARTIFACT_DIR, "classification_report.txt")
LABELMAP_PATH = os.path.join(ARTIFACT_DIR, "label_mapping.json")
PRED_OUT_PATH = os.path.join(ARTIFACT_DIR, "test_set_with_sentiment.csv")

# Model choice
MODEL_NAME = "distilbert-base-uncased"   # swap to "roberta-base" to try RoBERTa
MAX_LENGTH = 256
N_EPOCHS = 5
LR = 3e-5
TRAIN_BS = 16
EVAL_BS = 16
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
SEED = 42

os.makedirs(ARTIFACT_DIR, exist_ok=True)
set_seed(SEED)

# -----------------------------
# Safety check for common import conflict
# -----------------------------
# Warn if a local file/folder named 'torch' exists
if any(name == "torch.py" or name == "torch" for name in os.listdir(".")):
    print("[WARN] A local file/folder named 'torch' is present in the working directory.\n"
          "       This can shadow the real PyTorch package and cause AttributeError issues.\n"
          "       Rename it and remove __pycache__ if present.")

# -----------------------------
# Helpers
# -----------------------------

def coalesce_text(df: pd.DataFrame, primary: str, fallback: str) -> pd.Series:
    prim = df[primary] if primary in df.columns else pd.Series([None]*len(df))
    fall = df[fallback] if fallback in df.columns else pd.Series([None]*len(df))
    out = []
    for a, b in zip(prim.fillna(""), fall.fillna("")):
        at = str(a).strip()
        bt = str(b).strip()
        out.append(at if len(at) > 0 else bt)
    return pd.Series(out)


def normalize_label_series(s: pd.Series) -> (List[int], Dict[int, str], Dict[str, int]):
    # Convert possibly numeric/string labels into contiguous ids 0..K-1
    if s.dtype.kind in {"i", "u"}:  # integer/unsigned
        uniq = sorted(pd.Series(s.unique()).dropna().tolist())
        label2id = {str(lbl): i for i, lbl in enumerate(uniq)}
        id2label = {i: str(lbl) for i, lbl in enumerate(uniq)}
        mapped = [label2id[str(x)] for x in s]
    else:
        uniq = sorted(pd.Series(s.astype(str).unique()).dropna().tolist())
        label2id = {lbl: i for i, lbl in enumerate(uniq)}
        id2label = {i: lbl for lbl, i in label2id.items()}
        mapped = [label2id[str(x)] for x in s.astype(str)]
    return mapped, id2label, label2id


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": acc, "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}


# -----------------------------
# Load data
# -----------------------------
print("Loading CSVs...")
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

train_texts = coalesce_text(train_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
val_texts   = coalesce_text(val_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)
test_texts  = coalesce_text(test_df, TEXT_COL_PRIMARY, TEXT_COL_FALLBACK)

# Filter out truly empty
train_mask = train_texts.str.len() > 0
val_mask = val_texts.str.len() > 0
train_df = train_df.loc[train_mask].reset_index(drop=True)
val_df = val_df.loc[val_mask].reset_index(drop=True)
train_texts = train_texts.loc[train_mask].reset_index(drop=True)
val_texts = val_texts.loc[val_mask].reset_index(drop=True)

if LABEL_COL not in train_df.columns:
    raise ValueError(f"'{LABEL_COL}' column not found in training data.")

train_label_ids, id2label, label2id = normalize_label_series(train_df[LABEL_COL])
val_label_ids, _, _ = normalize_label_series(val_df[LABEL_COL])

with open(LABELMAP_PATH, "w") as f:
    json.dump({"id2label": {str(k): v for k, v in id2label.items()},
               "label2id": label2id}, f, indent=2)

num_labels = len(id2label)
print(f"Detected {num_labels} labels: {id2label}")

# -----------------------------
# Class imbalance handling
# -----------------------------
classes = np.unique(train_label_ids)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_label_ids)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# -----------------------------
# Tokenization
# -----------------------------
print("Tokenizing...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

train_enc = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)
val_enc = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)
test_enc = tokenizer(test_texts.fillna("").tolist(), padding=True, truncation=True, max_length=MAX_LENGTH)

train_dataset = TextDataset(train_enc, train_label_ids)
val_dataset = TextDataset(val_enc, val_label_ids)
test_dataset = TextDataset(test_enc, labels=None)

# -----------------------------
# Model
# -----------------------------
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={i: id2label[i] for i in range(num_labels)},
    label2id={id2label[i]: i for i in range(num_labels)},
)

# -----------------------------
# Custom Trainer with weighted loss
# -----------------------------
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # move weights to the right device lazily (Trainer sets device later)
        self._class_weights_cpu = class_weights
        self._class_weights = None

    @property
    def class_weights(self):
        # ensure weights are on the same device as model
        if self._class_weights is None or self._class_weights.device != self.args.device:
            self._class_weights = self._class_weights_cpu.to(self.args.device)
        return self._class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0, ignore_index=-100):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=self.class_weights, ignore_index=ignore_index)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# -----------------------------
# Training setup
# -----------------------------
use_cuda = torch.cuda.is_available()
print("CUDA available:", use_cuda)

training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "hf_runs"),
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=LR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    fp16=use_cuda,
    seed=SEED,
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -----------------------------
# Train & Evaluate
# -----------------------------
print("Training...")
trainer.train()

print("Evaluating on validation set...")
val_metrics = trainer.evaluate()
with open(METRICS_PATH, "w") as f:
    json.dump(val_metrics, f, indent=2)
print("Validation metrics saved to:", METRICS_PATH)

val_logits = trainer.predict(val_dataset).predictions
val_preds = np.argmax(val_logits, axis=-1)
val_true = np.array(val_label_ids)
report = classification_report(val_true, val_preds, target_names=[id2label[i] for i in range(num_labels)], digits=4)
with open(REPORT_PATH, "w") as f:
    f.write(report)
print("Classification report saved to:", REPORT_PATH)

# -----------------------------
# Predict on test and save outputs
# -----------------------------
print("Predicting on test set...")
with torch.no_grad():
    test_out = trainer.predict(test_dataset)
    test_logits = test_out.predictions
    test_pred_ids = np.argmax(test_logits, axis=-1)

id2label_str = {i: id2label[i] for i in range(num_labels)}
test_labels = [id2label_str[i] for i in test_pred_ids]

def softmax(x):
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

probs = softmax(test_logits)
conf = probs.max(axis=-1)

out_df = pd.DataFrame({
    ID_COL: test_df[ID_COL] if ID_COL in test_df.columns else np.arange(len(test_df)),
    "person_name": test_df[PERSON_COL] if PERSON_COL in test_df.columns else None,
    "feedback_text": test_texts,
    "predicted_sentiment": test_labels,
    "pred_confidence": conf,
})

out_df.to_csv(PRED_OUT_PATH, index=False)
print("Saved test predictions to:", PRED_OUT_PATH)

# -----------------------------
# Save model & tokenizer
# -----------------------------
print("Saving model and tokenizer...")
trainer.save_model(MODEL_OUT_DIR)
tokenizer.save_pretrained(MODEL_OUT_DIR)
print("Model saved to:", MODEL_OUT_DIR)

print("All done. ✨")

Loading CSVs...
Detected 9 labels: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8'}
Tokenizing...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model...
CUDA available: True


  super().__init__(*args, **kwargs)


Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,No log,2.158556,0.181034,0.092637,0.174587,0.114343
2,2.186400,1.904523,0.362069,0.221558,0.292253,0.21012
3,1.951300,1.748572,0.396552,0.302408,0.353257,0.316883
4,1.656600,1.632641,0.5,0.400522,0.449431,0.413893
5,1.456200,1.615832,0.482759,0.452933,0.437322,0.41969


Evaluating on validation set...


Validation metrics saved to: artifacts/metrics_validation.json
Classification report saved to: artifacts/classification_report.txt
Predicting on test set...


Saved test predictions to: artifacts/test_set_with_sentiment.csv
Saving model and tokenizer...
Model saved to: artifacts/model
All done. ✨


In [13]:
# Task 2 – Exploratory Data Analysis (EDA)
# ========================================
# This script performs EDA on the employee sentiment dataset.
# Input: train_set.csv, validation_set.csv, test_set_with_sentiment.csv
# Output: visualizations (saved to artifacts/eda/) + printed insights

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# -----------------------------
# Setup
# -----------------------------
ARTIFACT_DIR = "artifacts/eda"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

TRAIN_PATH = "train_set.csv"
VAL_PATH   = "validation_set.csv"
TEST_PATH  = "test_set_with_sentiment.csv"

# -----------------------------
# Load Data
# -----------------------------
train = pd.read_csv(TRAIN_PATH)
val = pd.read_csv(VAL_PATH)
test = pd.read_csv(TEST_PATH)

print("Train shape:", train.shape)
print("Validation shape:", val.shape)
print("Test shape:", test.shape)

print("\nTrain head:")
print(train.head())

# -----------------------------
# Missing values check
# -----------------------------
print("\nMissing values (train):")
print(train.isnull().sum())

print("\nMissing values (test):")
print(test.isnull().sum())

# -----------------------------
# Sentiment Distribution
# -----------------------------
plt.figure(figsize=(6,4))
sns.countplot(x=train['label'])
plt.title("Training Set Sentiment Distribution")
plt.savefig(os.path.join(ARTIFACT_DIR, "train_sentiment_distribution.png"))
plt.close()

plt.figure(figsize=(6,4))
sns.countplot(x=test['predicted_sentiment'])
plt.title("Test Set Predicted Sentiment Distribution")
plt.savefig(os.path.join(ARTIFACT_DIR, "test_sentiment_distribution.png"))
plt.close()

# -----------------------------
# Sentiment Over Time (if date column exists)
# -----------------------------
# Check if 'reviewed' column exists and is suitable for datetime conversion in the test set
if 'reviewed' in test.columns and test['reviewed'].dtype != 'object':
    try:
        test['reviewed'] = pd.to_datetime(test['reviewed'])
        monthly_sentiment = test.groupby([test['reviewed'].dt.to_period("M"), 'predicted_sentiment']).size().unstack().fillna(0)

        if not monthly_sentiment.empty:
            monthly_sentiment.plot(kind='line', marker='o', figsize=(10,6))
            plt.title("Monthly Sentiment Trends")
            plt.ylabel("Message Count")
            plt.xlabel("Month")
            plt.savefig(os.path.join(ARTIFACT_DIR, "monthly_sentiment_trends.png"))
            plt.close()
        else:
            print("\nNo valid date data in 'reviewed' column of test set for time series plot.")
    except Exception as e:
        print(f"\nCould not create monthly sentiment plot: {e}")


# -----------------------------
# Employee-wise Analysis
# -----------------------------
if 'person_name' in test.columns:
    employee_sentiment = test.groupby(['person_name','predicted_sentiment']).size().unstack().fillna(0)

    # Top 10 employees by message count
    top10 = employee_sentiment.sum(axis=1).sort_values(ascending=False).head(10)
    if not top10.empty:
        plt.figure(figsize=(10,6))
        top10.plot(kind='bar')
        plt.title("Top 10 Employees by Number of Messages")
        plt.ylabel("Message Count")
        plt.savefig(os.path.join(ARTIFACT_DIR, "top10_employees.png"))
        plt.close()
    else:
        print("\nNo data for employee-wise analysis.")


# -----------------------------
# Wordclouds
# -----------------------------
if 'feedback_text' in test.columns:
    # Get all unique predicted sentiment labels from the test set
    sentiment_labels = test['predicted_sentiment'].unique()

    # Generate a wordcloud for each sentiment label
    for label in sentiment_labels:
        text_for_label = " ".join(test[test['predicted_sentiment']==label]["feedback_text"].dropna())

        if text_for_label: # Only generate wordcloud if there is text for the label
            plt.figure(figsize=(8, 4))
            wordcloud = WordCloud(width=800, height=400).generate(text_for_label)
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.title(f"WordCloud for Sentiment: {label}")
            plt.savefig(os.path.join(ARTIFACT_DIR, f"wordcloud_sentiment_{label}.png"))
            plt.close()
        else:
            print(f"\nNo text found for sentiment label: {label}. Skipping wordcloud generation.")


# -----------------------------
# Summary Printouts
# -----------------------------
print("\n=== EDA Insights ===")
print("Training set sentiment distribution:")
print(train['label'].value_counts(normalize=True))

print("Test set predicted sentiment distribution:")
print(test['predicted_sentiment'].value_counts(normalize=True))

if 'person_name' in test.columns and 'top10' in locals():
    print("Top employees by number of messages:")
    print(top10)

Train shape: (656, 13)
Validation shape: (116, 13)
Test shape: (225, 5)

Train head:
      id     person_name                                  nine_box_category  \
0      1        John Doe  Category 1: 'Risk' (Low performance, Low poten...   
1  10045   Douglas Henry  Category 1: 'Risk' (Low performance, Low poten...   
2  10044   Douglas Henry  Category 1: 'Risk' (Low performance, Low poten...   
3  10005  Freddie Davies  Category 1: 'Risk' (Low performance, Low poten...   
4  10004  Freddie Davies  Category 1: 'Risk' (Low performance, Low poten...   

                                            feedback  adjusted  reviewed  \
0  John has not progressed in his position. He is...     False      True   
1  Douglas Henry has been having trouble in all a...     False     False   
2  Douglas has a lot to work on and areas to grow...     False     False   
3  Freddie is a nice guy, but his performance and...     False      True   
4  Freddie has been quite disappointing this quar...     Fal

In [27]:

# Task 3 – Employee Score Calculation (no dates available)
import pandas as pd
import os

ARTIFACT_DIR = "artifacts/task3"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Load labeled dataset
test = pd.read_csv("test_set_with_sentiment.csv")

# Map sentiments to numeric scores
sentiment_map = {"Positive": 1, "Negative": -1, "Neutral": 0}
test['sentiment_score'] = test['predicted_sentiment'].map(sentiment_map)

# Aggregate scores by employee (no monthly grouping possible)
employee_scores = (
    test.groupby('person_name')['sentiment_score']
    .sum()
    .reset_index()
    .sort_values(by='sentiment_score', ascending=False)
)

# Save output
employee_scores.to_csv(os.path.join(ARTIFACT_DIR, "employee_scores.csv"), index=False)

print("Employee sentiment scores saved to artifacts/task3/employee_scores.csv")
print(employee_scores.head(10))


Employee sentiment scores saved to artifacts/task3/employee_scores.csv
         person_name  sentiment_score
0        Alisa Stark              0.0
1        Allan Logan              0.0
2          Amy Jones              0.0
3       Andrew Grant              0.0
4  Angelica Peterson              0.0
5      Archie Dawson              0.0
6     Aryanna Carney              0.0
7        Ashton Owen              0.0
8      Aubri Hartman              0.0
9        Aydin Pitts              0.0


In [24]:
# Task 4 – Employee Ranking
import pandas as pd
import os

ARTIFACT_DIR = "artifacts/task4"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Try loading monthly scores first, else fall back to overall scores
monthly_path = "artifacts/task3/employee_monthly_scores.csv"
overall_path = "artifacts/task3/employee_scores.csv"

if os.path.exists(monthly_path):
    print("Using monthly scores...")
    df = pd.read_csv(monthly_path)

    # Add ranking per month
    df['rank'] = df.groupby('year_month')['sentiment_score'].rank(
        method="dense", ascending=False
    ).astype(int)

    out_path = os.path.join(ARTIFACT_DIR, "employee_monthly_ranking.csv")
    df.to_csv(out_path, index=False)
    print(f"Monthly ranking saved to {out_path}")
    print(df.head(10))

elif os.path.exists(overall_path):
    print("Using overall scores...")
    df = pd.read_csv(overall_path)

    # Add global rank
    df['rank'] = df['sentiment_score'].rank(
        method="dense", ascending=False
    ).astype(int)

    out_path = os.path.join(ARTIFACT_DIR, "employee_ranking.csv")
    df.to_csv(out_path, index=False)
    print(f"Overall ranking saved to {out_path}")
    print(df.head(10))

else:
    raise FileNotFoundError("Neither employee_monthly_scores.csv nor employee_scores.csv found. Please run Task 3 first.")


Using monthly scores...
Monthly ranking saved to artifacts/task4/employee_monthly_ranking.csv
Empty DataFrame
Columns: [person_name, year_month, sentiment_score, rank]
Index: []


In [25]:
# Task 5 – Flight Risk Identification
import pandas as pd
import os

ARTIFACT_DIR = "artifacts/task5"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

monthly_path = "artifacts/task4/employee_monthly_ranking.csv"
overall_path = "artifacts/task4/employee_ranking.csv"

def classify_risk(score, rank, rank_percentile):
    if score < 0 or rank_percentile >= 0.8:
        return "High Risk"
    elif score == 0 or (0.4 <= rank_percentile < 0.8):
        return "Medium Risk"
    else:
        return "Low Risk"

if os.path.exists(monthly_path):
    print("Using monthly ranking for risk analysis...")
    df = pd.read_csv(monthly_path)

    # Compute average score and average rank per employee
    summary = df.groupby('person_name').agg(
        avg_score=('sentiment_score','mean'),
        avg_rank=('rank','mean')
    ).reset_index()

    # Normalize ranks into percentile
    summary['rank_percentile'] = summary['avg_rank'] / summary['avg_rank'].max()

    # Assign risk
    summary['flight_risk'] = summary.apply(
        lambda row: classify_risk(row['avg_score'], row['avg_rank'], row['rank_percentile']), axis=1
    )

    out_path = os.path.join(ARTIFACT_DIR, "employee_flight_risk.csv")
    summary.to_csv(out_path, index=False)
    print(f"Flight risk file saved to {out_path}")
    print(summary.head(10))

elif os.path.exists(overall_path):
    print("Using overall ranking for risk analysis...")
    df = pd.read_csv(overall_path)

    df['rank_percentile'] = df['rank'] / df['rank'].max()
    df['flight_risk'] = df.apply(
        lambda row: classify_risk(row['sentiment_score'], row['rank'], row['rank_percentile']), axis=1
    )

    out_path = os.path.join(ARTIFACT_DIR, "employee_flight_risk.csv")
    df.to_csv(out_path, index=False)
    print(f"Flight risk file saved to {out_path}")
    print(df.head(10))

else:
    raise FileNotFoundError("No ranking data found from Task 4. Please run Task 4 first.")


Using monthly ranking for risk analysis...
Flight risk file saved to artifacts/task5/employee_flight_risk.csv
Empty DataFrame
Columns: [person_name, avg_score, avg_rank, rank_percentile, flight_risk]
Index: []


In [30]:
# Task 6 – Streamlit Dashboard
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

st.set_page_config(page_title="Employee Sentiment Dashboard", layout="wide")

# -----------------------------
# Load Data
# -----------------------------
sentiment_path = "test_set_with_sentiment.csv"
score_path = "artifacts/task3/employee_scores.csv"
monthly_score_path = "artifacts/task3/employee_monthly_scores.csv"
ranking_path = "artifacts/task4/employee_ranking.csv"
monthly_ranking_path = "artifacts/task4/employee_monthly_ranking.csv"
risk_path = "artifacts/task5/employee_flight_risk.csv"

@st.cache_data
def load_data(path):
    if os.path.exists(path):
        return pd.read_csv(path)
    return None

test = load_data(sentiment_path)
scores = load_data(score_path)
monthly_scores = load_data(monthly_score_path)
ranking = load_data(ranking_path)
monthly_ranking = load_data(monthly_ranking_path)
risk = load_data(risk_path)

st.title("📊 Employee Sentiment & Flight Risk Dashboard")

# -----------------------------
# Sentiment Distribution
# -----------------------------
st.header("1. Sentiment Distribution")
if test is not None:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.countplot(x=test['predicted_sentiment'], ax=ax)
    ax.set_title("Predicted Sentiment Distribution")
    st.pyplot(fig)

# -----------------------------
# Sentiment Trends Over Time
# -----------------------------
st.header("2. Sentiment Trends Over Time")
if monthly_scores is not None and "year_month" in monthly_scores.columns:
    trend = monthly_scores.groupby(['year_month'])['sentiment_score'].sum().reset_index()
    fig, ax = plt.subplots(figsize=(8,4))
    sns.lineplot(data=trend, x="year_month", y="sentiment_score", marker="o", ax=ax)
    ax.set_title("Monthly Sentiment Trends")
    st.pyplot(fig)

# -----------------------------
# Employee Scores
# -----------------------------
st.header("3. Employee Sentiment Scores")
if scores is not None:
    st.dataframe(scores)
    fig, ax = plt.subplots(figsize=(8,4))
    top_scores = scores.sort_values(by="sentiment_score", ascending=False).head(10)
    sns.barplot(data=top_scores, x="sentiment_score", y="person_name", ax=ax)
    ax.set_title("Top 10 Employees by Sentiment Score")
    st.pyplot(fig)

# -----------------------------
# Employee Ranking
# -----------------------------
st.header("4. Employee Rankings")
if ranking is not None:
    st.dataframe(ranking.sort_values(by="rank"))
elif monthly_ranking is not None:
    st.dataframe(monthly_ranking.head(20))

# -----------------------------
# Flight Risk
# -----------------------------
st.header("5. Flight Risk Identification")
if risk is not None:
    st.dataframe(risk)
    fig, ax = plt.subplots(figsize=(6,4))
    sns.countplot(x=risk['flight_risk'], ax=ax, order=["Low Risk", "Medium Risk", "High Risk"])
    ax.set_title("Distribution of Flight Risk Levels")
    st.pyplot(fig)


2025-08-19 17:24:47.296 No runtime found, using MemoryCacheStorageManager
