In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load datasets
train_df = pd.read_csv('/kaggle/input/train-csv/train.csv')
test_df = pd.read_csv('/kaggle/input/test-csv/test.csv')

def clean_text_refined(text):
    if pd.isna(text): return ""
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Keep: letters, spaces, !, and emojis (non-ascii)
    cleaned = "".join([char for char in text if 'a' <= char <= 'z' or char == ' ' or char == '!' or ord(char) > 127])
    # Normalize whitespace
    return re.sub(r'\s+', ' ', cleaned).strip()

# Apply to Train and Test
for df in [train_df, test_df]:
    df['Review Text'] = df['Review Text'].fillna('')
    df['Review Title'] = df['Review Title'].fillna('')
    df['combined_text'] = df['Review Title'] + " " + df['Review Text']
    df['cleaned_text'] = df['combined_text'].apply(clean_text_refined)

# 80-20 Split
train_set, val_set = train_test_split(
    train_df, test_size=0.20, random_state=42, stratify=train_df['Rating']
)

print(f"Preprocessed! Train size: {len(train_set)}, Validation size: {len(val_set)}")

Preprocessed! Train size: 4554, Validation size: 1139


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

# Hyperparameters
MAX_WORDS = 10000  # Only keep the top 10k most frequent words
MAX_LEN = 100      # Max number of words per review
EMBEDDING_DIM = 100

# Initialize and fit tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_set['cleaned_text'])

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(train_set['cleaned_text'])
X_val_seq = tokenizer.texts_to_sequences(val_set['cleaned_text'])

# Pad sequences
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Convert labels to 0-indexed (1-5 -> 0-4) and then to categorical
y_train_indices = train_set['Rating'].values - 1
y_val_indices = val_set['Rating'].values - 1

y_train_cat = to_categorical(y_train_indices, num_classes=5)
y_val_cat = to_categorical(y_val_indices, num_classes=5)

In [5]:
# Calculate weights automatically based on frequency
classes = np.unique(y_train_indices)
weights = class_weight.compute_class_weight(class_weight='balanced', 
                                            classes=classes, 
                                            y=y_train_indices)
class_weights_dict = dict(zip(classes, weights))

print("Calculated Class Weights:", class_weights_dict)

Calculated Class Weights: {np.int64(0): np.float64(0.6369230769230769), np.int64(1): np.float64(7.404878048780488), np.int64(2): np.float64(5.23448275862069), np.int64(3): np.float64(1.8625766871165643), np.int64(4): np.float64(0.3895637296834902)}


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D

model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax') # 5 output nodes for ratings 1-5
])

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()



In [None]:
EPOCHS = 15
BATCH_SIZE = 32

history = model.fit(
    X_train_padded, y_train_cat,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_padded, y_val_cat),
    class_weight=class_weights_dict, # Apply the class weights here
    verbose=1
)

In [None]:
# Predict probabilities for each class
val_predictions_probs = model.predict(X_val_padded)

# Get the index of the highest probability (0-4)
val_predictions_indices = np.argmax(val_predictions_probs, axis=1)

# Convert indices back to original ratings (0-4 -> 1-5)
val_predictions_ratings = val_predictions_indices + 1
val_actual_ratings = y_val_indices + 1

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate Weighted F1-Score
wf1 = f1_score(val_actual_ratings, val_predictions_ratings, average='weighted')
print(f"Validation Weighted F1-Score: {wf1:.4f}")

# Detailed Report
print("\nClassification Report:")
print(classification_report(val_actual_ratings, val_predictions_ratings))

In [None]:
# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(val_actual_ratings, val_predictions_ratings)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=[1, 2, 3, 4, 5], 
            yticklabels=[1, 2, 3, 4, 5])
plt.xlabel('Predicted Rating')
plt.ylabel('Actual Rating')
plt.title('Validation Confusion Matrix')
plt.show()

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1. Define Callbacks for Fine-Tuning
# EarlyStopping stops training when validation loss stops improving to prevent overfitting
early_stop = EarlyStopping(
    monitor='val_loss', 
    patience=3, 
    restore_best_weights=True
)

# ModelCheckpoint saves the best version of your model during the training process
checkpoint = ModelCheckpoint(
    'best_model.h5', 
    monitor='val_loss', 
    save_best_only=True
)

# 2. Build the Refined Model Architecture
# We're adding a second LSTM layer and adjusting dropout for better feature extraction
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(5, activation='softmax')
])

# Using a slightly lower learning rate (0.0005) for more stable convergence
optimizer = Adam(learning_rate=0.0005)

model.compile(
    loss='categorical_crossentropy', 
    optimizer=optimizer, 
    metrics=['accuracy']
)

# 3. Train the Model with Class Weights and Callbacks
history = model.fit(
    X_train_padded, 
    y_train_cat,
    epochs=20, 
    batch_size=64,
    validation_data=(X_val_padded, y_val_cat),
    class_weight=class_weights_dict,  # Crucial for the 2, 3, 4 ratings
    callbacks=[early_stop, checkpoint],
    verbose=1
)

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load the best weights saved during training
model.load_weights('best_model.h5')

# 2. Predict on validation data
val_preds_probs = model.predict(X_val_padded)
val_preds_indices = np.argmax(val_preds_probs, axis=1)

# 3. Calculate Weighted F1-Score
val_actual = y_val_indices + 1
val_predicted = val_preds_indices + 1

final_f1 = f1_score(val_actual, val_predicted, average='weighted')
print(f"Fine-Tuned Validation Weighted F1-Score: {final_f1_score:.4f}")

# 4. Detailed Report
print("\nFinal Classification Report:")
print(classification_report(val_actual, val_predicted))

In [None]:
# 1. Preprocess and Tokenize the Test Set
# (Assuming 'test_df' was already cleaned in our earlier preprocessing step)
X_test_seq = tokenizer.texts_to_sequences(test_df['cleaned_text'])
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# 2. Generate Predictions
test_preds_probs = model.predict(X_test_padded)
test_preds_indices = np.argmax(test_preds_probs, axis=1)

# 3. Convert indices back to 1-5 ratings
test_df['Star Rating'] = test_preds_indices + 1

In [None]:
# Create the submission dataframe with only 'id' and 'Star Rating'
submission = test_df[['id', 'Star Rating']]

# Save to CSV
submission.to_csv('predictions.csv', index=False)

print("Success! 'predictions.csv' has been created.")
print(submission.head())

In [6]:
from sklearn.utils import resample
import pandas as pd

# Separate classes
df_1 = train_set[train_set.Rating == 1]
df_2 = train_set[train_set.Rating == 2]
df_3 = train_set[train_set.Rating == 3]
df_4 = train_set[train_set.Rating == 4]
df_5 = train_set[train_set.Rating == 5]

# Upsample minority classes
target_n = 2000
df_2_ups = resample(df_2, replace=True, n_samples=target_n, random_state=42)
df_3_ups = resample(df_3, replace=True, n_samples=target_n, random_state=42)
df_4_ups = resample(df_4, replace=True, n_samples=target_n, random_state=42)

# Combine and shuffle
train_balanced = pd.concat([df_1, df_2_ups, df_3_ups, df_4_ups, df_5])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, classification_report
import numpy as np

# 1. Initialize Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# 2. Custom Dataset Class
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Extract features for this index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# 3. Tokenize Data (Using max_length 128 for efficiency)
train_encodings = tokenizer(train_balanced['cleaned_text'].tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_set['cleaned_text'].tolist(), truncation=True, padding=True, max_length=128)

# 4. Prepare Datasets (Rating 1-5 mapped to 0-4)
train_dataset = ReviewDataset(train_encodings, (train_balanced['Rating'] - 1).tolist())
val_dataset = ReviewDataset(val_encodings, (val_set['Rating'] - 1).tolist())

# 5. Define Metric Function for Weighted F1
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # This is the exact metric required by your assignment
    weighted_f1 = f1_score(labels, preds, average='weighted')
    return {'weighted_f1': weighted_f1}

# 6. Load Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# 7. Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",      # Changed from eval_strategy for compatibility
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="weighted_f1", # Use F1 to pick the best model
    logging_dir='./logs',
    report_to="none"
)

# 8. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 9. Train and Evaluate
trainer.train()
eval_results = trainer.evaluate()
print(f"Final Validation Results: {eval_results}")

# 10. Generate Final Predictions for Test Set
test_encodings = tokenizer(test_df['cleaned_text'].tolist(), truncation=True, padding=True, max_length=128)
test_dataset = ReviewDataset(test_encodings)

raw_preds = trainer.predict(test_dataset)
test_preds = np.argmax(raw_preds.predictions, axis=-1) + 1 # Convert 0-4 back to 1-5

# Save submission
test_df['Star Rating'] = test_preds
test_df[['id', 'Star Rating']].to_csv('predictions.csv', index=False)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Weighted F1
1,No log,0.845947,0.708309
2,0.758500,0.84672,0.734698
3,0.758500,0.893501,0.702958




Final Validation Results: {'eval_loss': 0.8467204570770264, 'eval_weighted_f1': 0.7346981859889221, 'eval_runtime': 2.7527, 'eval_samples_per_second': 413.778, 'eval_steps_per_second': 13.078, 'epoch': 3.0}


In [11]:
pd.prediction.head()

AttributeError: module 'pandas' has no attribute 'prediction'

In [18]:
!pip install -U accelerate transformers

