In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np

# --- 1. Data Loading and Cleaning ---

# Field layout of the ' ::: '-delimited input files.
# Labelled files carry all four fields.
TRAIN_COLUMNS = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
# The unlabelled file has the same fields minus the genre label.
TEST_COLUMNS = [name for name in TRAIN_COLUMNS if name != 'GENRE']
# Leading metadata fields, discarded after parsing.
SKIP_COLUMNS = TRAIN_COLUMNS[:2]

# Function to safely load and parse the raw data files with ' ::: ' delimiter
def load_and_clean_data(file_path, cols, skip_cols=None):
    """Parse a ' ::: '-delimited text file into a DataFrame.

    Each non-blank line is stripped and split into at most ``len(cols)``
    fields, so a ' ::: ' sequence inside the last field is kept as part of
    that field. Lines that do not produce exactly ``len(cols)`` fields are
    skipped (and counted in a warning) instead of invalidating the whole file.

    Args:
        file_path: Path of the UTF-8 text file to read.
        cols: Column names, in the order the fields appear on each line.
        skip_cols: Columns to drop from the result. Defaults to the
            module-level ``SKIP_COLUMNS``. Names that are not present are
            ignored.

    Returns:
        A DataFrame of string fields without the skipped columns. If the file
        cannot be read or contains no well-formed line, a message is printed
        and an empty DataFrame with columns ``cols`` is returned.
    """
    expected = len(cols)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Bound the split so the final field may itself contain ' ::: '.
            rows = [
                text.split(' ::: ', expected - 1)
                for text in stripped_lines
                if text
            ]
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please check your file names.")
        return pd.DataFrame(columns=cols)
    except (OSError, UnicodeError) as e:
        print(f"An error occurred while loading {file_path}: {e}")
        return pd.DataFrame(columns=cols)

    # Keep only rows with the expected field count; a single bad line must
    # not discard the whole file.
    valid_rows = [row for row in rows if len(row) == expected]
    if not valid_rows:
        print(f"Warning: Data in {file_path} did not match expected column count ({len(cols)}).")
        return pd.DataFrame(columns=cols)

    skipped = len(rows) - len(valid_rows)
    if skipped:
        print(f"Warning: Skipped {skipped} malformed line(s) in {file_path}.")

    df = pd.DataFrame(valid_rows, columns=cols)

    # Remove metadata columns not used for prediction
    if skip_cols is None:
        skip_cols = SKIP_COLUMNS
    return df.drop(columns=skip_cols, errors='ignore')


# --- Load Datasets ---
# The training file and the test solution file share the labelled four-field
# layout; the plain test file has no GENRE field.
train_df = load_and_clean_data('train_data.txt', TRAIN_COLUMNS)
test_df = load_and_clean_data('test_data.txt', TEST_COLUMNS)
solution_df = load_and_clean_data('test_data_solution.txt', TRAIN_COLUMNS)


# Check if dataframes were loaded successfully
if train_df.empty or test_df.empty or solution_df.empty:
    print("Aborting analysis due to missing or improperly loaded data files.")
    # Raise SystemExit directly: the exit() helper comes from the `site`
    # module, is meant for interactive sessions, and would report success.
    raise SystemExit(1)

# The evaluation further down compares predictions for test_df row-by-row with
# the labels in solution_df, so fail fast if the two files differ in length.
if len(test_df) != len(solution_df):
    print(
        f"Aborting analysis: test data has {len(test_df)} rows but the "
        f"solution file has {len(solution_df)} rows."
    )
    raise SystemExit(1)


# --- 2. Text Preprocessing Function ---
def clean_text(text):
    """Normalize a plot description to lowercase ASCII letters and single spaces.

    Args:
        text: The raw description. Anything that is not a ``str`` (``None``,
            NaN, numbers, lists, ...) is treated as missing.

    Returns:
        The lowercased text with every character other than ``a``-``z`` and
        whitespace removed and whitespace runs collapsed to one space, or an
        empty string for non-string input.
    """
    # The type check alone covers None and NaN (neither is a str). Calling
    # pd.isna first would fail on list-like input with an ambiguous truth
    # value, so it is not used here.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove numbers and special characters (keeping only letters and spaces).
    # NOTE: characters are deleted, not replaced by a space, so "well-known"
    # becomes "wellknown".
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Target labels: training genres, and the reference genres from the solution file.
# NOTE(review): y_true is later compared row-by-row with predictions for test_df,
# which presumes both files list the samples in the same order — confirm.
y_train_raw = train_df['GENRE']
y_true = solution_df['GENRE']

# Normalized plot summaries, one cleaned string per description.
X_train_raw = train_df['DESCRIPTION'].map(clean_text)
X_test_raw = test_df['DESCRIPTION'].map(clean_text)


# --- 3. Label Encoding (for target variable Y) ---
# Learn the genre -> integer mapping from the training labels only, then apply
# that same mapping to both the training labels and the reference labels.
label_encoder = LabelEncoder().fit(y_train_raw)
y_train = label_encoder.transform(y_train_raw)
y_true_encoded = label_encoder.transform(y_true)


# --- 4. Feature Engineering (TF-IDF) ---
# The vectorizer is fitted on the training text only; the test text is merely
# transformed with the fitted vectorizer, which avoids data leakage.
tfidf_settings = {
    'stop_words': 'english',
    'max_features': 25000,  # cap on the number of features kept
    'sublinear_tf': True,   # sublinear term-frequency scaling
    'ngram_range': (1, 2),  # unigrams and bigrams
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

print("--- Data Processing ---")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

n_train_samples, n_features = X_train_tfidf.shape
n_test_samples = X_test_tfidf.shape[0]
for summary_line in (
    f"Training Samples: {n_train_samples}, Features: {n_features}",
    f"Test Samples: {n_test_samples}",
    "-" * 35,
):
    print(summary_line)


# --- 5. Model Training (Logistic Regression) ---
# A linear classifier trained on the sparse TF-IDF matrix with the 'saga' solver.
separator = "-" * 35

print("Training Logistic Regression Model...")
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(
    C=5.0,  # inverse regularization strength (higher C = less regularization)
    solver='saga',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
).fit(X_train_tfidf, y_train)
print("Training Complete.")
print(separator)


# --- 6. Prediction and Evaluation ---
# Predict encoded genres for the test features and map them back to genre names.
y_pred_encoded = model.predict(X_test_tfidf)
y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_true_encoded, y_pred_encoded)

print("--- Model Evaluation ---")
print(f"Accuracy on Test Solution Data: {accuracy:.4f}")
print("\nClassification Report:\n")

# Generate and print the classification report.
# Pass the full list of encoded labels explicitly so that target_names always
# lines up with the rows of the report, even if some genre seen in training
# occurs in neither the true nor the predicted test labels.
all_encoded_labels = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_true_encoded,
    y_pred_encoded,
    labels=all_encoded_labels,
    target_names=label_encoder.classes_,
    zero_division=0
)
print(report)

# Optional: Save predictions to a file
# Side-by-side table of predicted and reference genre names, one row per test sample.
prediction_df = pd.DataFrame({
    'Predicted_Genre': y_pred_labels,
    'True_Genre': y_true
})
# prediction_df.to_csv('predictions.csv', index=False)
# print("\nPredictions saved to predictions.csv")

--- Data Processing ---
Training Samples: 54214, Features: 25000
Test Samples: 54200
-----------------------------------
Training Logistic Regression Model...
Training Complete.
-----------------------------------
--- Model Evaluation ---
Accuracy on Test Solution Data: 0.5950

Classification Report:

              precision    recall  f1-score   support

      action       0.47      0.35      0.40      1314
       adult       0.67      0.37      0.48       590
   adventure       0.55      0.22      0.32       775
   animation       0.47      0.13      0.20       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.60      0.57      7446
       crime       0.25      0.06      0.09       505
 documentary       0.70      0.83      0.76     13096
       drama       0.56      0.74      0.64     13612
      family       0.45      0.14      0.21       783
     fantasy       0.43      0.10      0.16       322
   game-show       0.88      0.59      0.70     