**TASK 2**

In [1]:
pip install pandas scikit-learn joblib



In [5]:
import pandas as pd

# Load the Telco churn export and clean it in a single chained pass:
#   1. drop the customer identifier (carries no predictive signal),
#   2. coerce 'TotalCharges' to numeric (non-numeric entries become NaN),
#   3. drop every row that now contains a missing value,
#   4. encode the 'Churn' target as 1 (Yes) / 0 (No).
df = (
    pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .drop(columns=['customerID'])
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
    .assign(Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}))
)

# Features are every remaining column except the target.
y = df['Churn']
X = df.drop(columns=['Churn'])

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib

# Split the feature columns by dtype: string-typed (object) columns are
# one-hot encoded, every other column is standardised.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

# Per-type transformers. handle_unknown='ignore' keeps prediction from failing
# on categories that were not present in the training folds.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build the preprocessing + classifier pipeline once. The 'classifier' step is
# only a placeholder: the grid below swaps in each candidate estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=1000, random_state=42))])

# Kept as a second name for the same object (previously an identical pipeline
# was constructed twice); GridSearchCV clones the estimator before fitting.
pipeline_for_grid = pipeline

# One sub-grid per classifier family, so each estimator is only combined with
# its own hyper-parameters (3 + 3*3 = 12 candidates).
param_grid = [
    {
        'classifier': [LogisticRegression(max_iter=1000, random_state=42)],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [10, 20, None]
    }
]

# 5-fold cross-validated search over both sub-grids, scored by accuracy.
grid_search = GridSearchCV(pipeline_for_grid, param_grid, cv=5, verbose=2, n_jobs=-1, scoring='accuracy')

# Hold out 20% of the rows for the final evaluation.
# NOTE(review): the split is not stratified on the target; consider
# stratify=y if the churn classes are imbalanced — confirm before changing,
# since it alters the reported numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run the search on the training portion only.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# best_estimator_ is the full pipeline (preprocessing + classifier) for the
# best parameter combination.
best_pipeline = grid_search.best_estimator_

# Evaluate the selected pipeline on the held-out rows.
test_accuracy = best_pipeline.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

# Persist the complete pipeline so raw feature frames can be scored later
# without re-implementing the preprocessing.
joblib.dump(best_pipeline, 'telco_churn_pipeline.joblib')

print("Pipeline exported to telco_churn_pipeline.joblib")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'classifier': LogisticRegression(max_iter=1000, random_state=42), 'classifier__C': 10}
Best cross-validation accuracy: 0.81
Test set accuracy: 0.79
Pipeline exported to telco_churn_pipeline.joblib


**TASK 5**

In [1]:
pip install pandas scikit-learn joblib transformers datasets scikit-learn pandas torch requests --upgrade

Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import joblib

# Step 1: Load the dataset
# Prefer the real ticket export; when the CSV is absent, fall back to a small
# hand-written sample so the rest of the notebook can still be exercised.
try:
    df = pd.read_csv('customer_support_tickets.csv')
except FileNotFoundError:
    print("Dataset not found. Using dummy data for testing.")
    # Each record is (ticket description, ticket type).
    fallback_tickets = [
        ("My laptop won't turn on, it seems like a hardware failure.", 'Technical issue'),
        ("I was charged twice for my subscription this month.", 'Billing inquiry'),
        ("How do I reset my password for the account?", 'Account access'),
        ("The software crashes every time I open a file.", 'Technical issue'),
        ("I want to cancel my subscription.", 'Cancellation request'),
        ("Need details about the premium plan.", 'Product inquiry'),
        ("Billing error: incorrect amount deducted.", 'Billing inquiry'),
        ("App not installing on my device.", 'Technical issue'),
        ("Need refund for a faulty product.", 'Refund request'),
        ("Network connection keeps dropping.", 'Technical issue'),
    ]
    descriptions, ticket_types = zip(*fallback_tickets)
    data = {
        'Ticket Description': list(descriptions),
        'Ticket Type': list(ticket_types),
    }
    df = pd.DataFrame(data)

# Step 2: Preprocess the dataset
text_column, label_column = 'Ticket Description', 'Ticket Type'

# Every distinct ticket type becomes a candidate tag, in order of first appearance.
categories = df[label_column].unique().tolist()
print("Available Categories/Tags:", categories)

# Integer ids for the classifier head, plus the reverse lookup for decoding predictions.
label_to_id = dict(zip(categories, range(len(categories))))
id_to_label = {idx: label for label, idx in label_to_id.items()}
df['label'] = df[label_column].map(label_to_id)

# Cap the working set at 500 rows to keep the run fast.
if len(df) > 500:
    df = df.sample(500, random_state=42)

# 80/20 train/test split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Only the text and the integer label are handed to Hugging Face Datasets.
model_columns = [text_column, 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns])
test_dataset = Dataset.from_pandas(test_df[model_columns])

# Step 3: Zero-shot classification
# Builds a Hugging Face "zero-shot-classification" pipeline on the
# facebook/bart-large-mnli checkpoint; no training on the ticket data happens
# in this step.
print("Running Zero-Shot Classification...")
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def zero_shot_predict(text):
    """Tag one ticket with the zero-shot classifier.

    Args:
        text: Ticket text to classify against the global ``categories`` list.

    Returns:
        A ``(predicted_label, top_tags)`` tuple. ``predicted_label`` is the
        first label in the classifier's result; ``top_tags`` holds up to three
        ``(label, score)`` pairs ordered by descending score.
    """
    output = zero_shot_classifier(text, candidate_labels=categories, multi_label=False)
    scored_labels = list(zip(output['labels'], output['scores']))
    scored_labels.sort(key=lambda pair: pair[1], reverse=True)
    return output['labels'][0], scored_labels[:3]

# Apply to test set
# Each call yields (predicted label, top-3 tags); unzip the pairs into two columns.
zero_shot_outputs = test_df[text_column].apply(zero_shot_predict)
predicted_labels, top3_tags = zip(*zero_shot_outputs)
test_df['zero_shot_pred'] = predicted_labels
test_df['zero_shot_top3'] = top3_tags
test_df['zero_shot_label_id'] = test_df['zero_shot_pred'].map(label_to_id)

# Evaluate zero-shot
true_label_ids = test_df['label']
zero_shot_label_ids = test_df['zero_shot_label_id']
zero_shot_accuracy = accuracy_score(true_label_ids, zero_shot_label_ids)
zero_shot_f1 = f1_score(true_label_ids, zero_shot_label_ids, average='weighted')
print("\nZero-Shot Accuracy:", zero_shot_accuracy)
print("Zero-Shot F1-Score:", zero_shot_f1)

# Step 4: Fine-tuning with LLM
print("Starting Fine-Tuning...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of examples from the ticket text column.

    Args:
        examples: Batch mapping that contains the ``text_column`` entry.

    Returns:
        The tokenizer output for the batch, produced with truncation enabled
        and ``padding="max_length"``.
    """
    ticket_texts = examples[text_column]
    return tokenizer(ticket_texts, truncation=True, padding="max_length")


# Tokenize the train split first, then the test split, both in batched mode.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Sequence-classification model with one output per ticket category.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(categories),
)

# Trainer configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir="./results",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,  # kept low for speed
    per_device_train_batch_size=4,  # small batches so the run is feasible on CPU
    per_device_eval_batch_size=4,
    # --- evaluation / checkpointing ---
    # `eval_strategy` is the current name of the former `evaluation_strategy`;
    # the save strategy is set to the same value so the two stay aligned.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Select the checkpoint to reload at the end by the "f1" metric.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, class axis last).

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    # The predicted class is the index of the highest score along the last axis.
    predicted_ids = pred.predictions.argmax(axis=-1)
    true_ids = pred.label_ids
    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        "f1": f1_score(true_ids, predicted_ids, average="weighted"),
    }

# Wire the model, data and metrics into a Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer in
# recent transformers releases (the notebook already targets those releases by
# using `eval_strategy`); passing the tokenizer here also saves it alongside
# the model checkpoints.
# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint, so the reported fine-tuned metrics are optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate fine-tuned model; compute_metrics' keys come back prefixed with "eval_".
eval_results = trainer.evaluate()
print("\nFine-Tuned Accuracy:", eval_results['eval_accuracy'])
print("Fine-Tuned F1-Score:", eval_results['eval_f1'])

# Step 5: Output top 3 probable tags
def fine_tuned_predict(text):
    """Return the most probable tags for one ticket from the fine-tuned model.

    Args:
        text: A single ticket description.

    Returns:
        Up to three ``(label, probability)`` tuples ordered from most to least
        probable. Probabilities are plain Python floats.
    """
    # Truncate so tickets longer than the model's maximum input length do not
    # raise at inference time (training-time tokenization truncates as well).
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # squeeze(0) drops only the batch axis, so a single-label head still
    # produces a 1-D probability vector.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze(0).cpu().numpy()
    # argsort is ascending: take the last three entries and reverse them.
    top_indices = np.argsort(probs)[-3:][::-1]
    # Cast to built-in types so results print as plain numbers rather than np.float32(...).
    top_tags = [(id_to_label[int(idx)], float(probs[idx])) for idx in top_indices]
    return top_tags

# Apply to a sample ticket
# Uses the first row (positional index 0) of the test split as a quick check
# of the fine-tuned model's top-3 output.
sample_ticket = test_df[text_column].iloc[0]
print("\nSample Ticket:", sample_ticket)
print("Fine-Tuned Top 3 Tags:", fine_tuned_predict(sample_ticket))

# Step 6: Compare performances
# Both approaches are scored on the same test split: zero-shot via
# accuracy_score/f1_score on test_df, fine-tuned via trainer.evaluate().
print("\nPerformance Comparison:")
print(f"Zero-Shot: Accuracy={zero_shot_accuracy:.4f}, F1={zero_shot_f1:.4f}")
print(f"Fine-Tuned: Accuracy={eval_results['eval_accuracy']:.4f}, F1={eval_results['eval_f1']:.4f}")

# Step 7: Save the models
# The fine-tuned model and its tokenizer are written to the same directory.
trainer.save_model("./fine_tuned_ticket_tagger")
tokenizer.save_pretrained("./fine_tuned_ticket_tagger")
# NOTE(review): joblib.dump serialises the entire zero-shot pipeline object via
# pickle. Loading a pickle executes arbitrary code, so only load this file from
# a trusted source; the pipeline's own save_pretrained(...) may be a more
# portable alternative — confirm before switching, as it changes the artifact.
joblib.dump(zero_shot_classifier, 'zero_shot_tagger.joblib')
print("Models saved successfully.")

Dataset not found. Using dummy data for testing.
Available Categories/Tags: ['Technical issue', 'Billing inquiry', 'Account access', 'Cancellation request', 'Product inquiry', 'Refund request']
Running Zero-Shot Classification...


Device set to use cuda:0



Zero-Shot Accuracy: 1.0
Zero-Shot F1-Score: 1.0
Starting Fine-Tuning...


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.78148,0.0,0.0
2,No log,1.792933,0.0,0.0



Fine-Tuned Accuracy: 0.0
Fine-Tuned F1-Score: 0.0

Sample Ticket: Need refund for a faulty product.
Fine-Tuned Top 3 Tags: [('Technical issue', np.float32(0.18319988)), ('Billing inquiry', np.float32(0.17627631)), ('Refund request', np.float32(0.17027633))]

Performance Comparison:
Zero-Shot: Accuracy=1.0000, F1=1.0000
Fine-Tuned: Accuracy=0.0000, F1=0.0000
Models saved successfully.
