In [None]:
import pandas as pd
import numpy as np

# Read the three competition files from the working directory.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report the (rows, columns) of every frame.
for label, frame in (
    ("Train dataset shape:", train_df),
    ("Test dataset shape:", test_df),
    ("Sample submission shape:", sample_submission),
):
    print(label, frame.shape)

# Preview the first rows of train/test and the whole sample submission.
for heading, preview in (
    ("\nTrain dataset head:", train_df.head()),
    ("\nTest dataset head:", test_df.head()),
    ("\nSample submission:", sample_submission),
):
    print(heading)
    print(preview)

Train dataset shape: (10000, 3)
Test dataset shape: (4020, 2)
Sample submission shape: (5, 2)

Train dataset head:
            ID                                               Text  \
0  train_00001  Its "duet" feature allows users to film a vide...   
1  train_00002  *@**%%@ To support this, Blizzard released the...   
2  train_00003  James Mitchell, the Premier of Western Austral...   
3  train_00004  Pharo has an implementation of a heap in the C...   
4  train_00005  a for Tour, Alberto his his possible allowed m...   

             Subject  
0        Pop Culture  
1             Gaming  
2            History  
3  Computer Sciences  
4             Sports  

Test dataset head:
          ID                                               Text
0  test_0001  Square's decision to produce games %*@*%@@ exc...
1  test_0002  Many of the properties in the Phase are set af...
2  test_0003  As of at least 2015, Apple has removed legacy ...
3  test_0004  Roman coins and medieval artefacts have al

In [None]:
# How many training rows carry each subject label.
print("Subject distribution in training data:")
print(train_df['Subject'].value_counts())

# Distinct labels, in order of first appearance in the training frame.
subjects = train_df['Subject'].unique()
print("\nUnique subjects:")
print(subjects)
print("Number of unique subjects:", len(subjects))

# Null counts per column for both splits.
for heading, frame in (
    ("\nMissing values in train data:", train_df),
    ("\nMissing values in test data:", test_df),
):
    print(heading)
    print(frame.isnull().sum())

# Character length of each raw text, stored as a new column on train_df.
train_df['text_length'] = train_df['Text'].str.len()
print("\nText length statistics:")
print(train_df['text_length'].describe())

# First text of each subject, cut to 200 characters when longer.
print("\nSample texts by subject:")
for subject in subjects:
    print(f"\n{subject}:")
    sample_text = train_df.loc[train_df['Subject'] == subject, 'Text'].iloc[0]
    if len(sample_text) > 200:
        print(sample_text[:200] + "...")
    else:
        print(sample_text)

Subject distribution in training data:
Subject
Sports               2210
Gaming               1640
Pop Culture          1566
Geography            1413
Natural Sciences     1389
Computer Sciences    1039
History               743
Name: count, dtype: int64

Unique subjects:
['Pop Culture' 'Gaming' 'History' 'Computer Sciences' 'Sports'
 'Natural Sciences' 'Geography']
Number of unique subjects: 7

Missing values in train data:
ID         0
Text       0
Subject    0
dtype: int64

Missing values in test data:
ID      0
Text    0
dtype: int64

Text length statistics:
count    10000.00000
mean       221.88560
std        288.31965
min         11.00000
25%        106.75000
50%        156.00000
75%        229.00000
max       5993.00000
Name: text_length, dtype: float64

Sample texts by subject:

Pop Culture:
Its "duet" feature allows users to film a video aside another video

Gaming:
*@**%%@ To support this, Blizzard released the hero reference kit before release, providing official colors and 

In [None]:
# Data-quality probe: look for known garbage-character sequences in the texts.
import re

print("Looking for potential data issues (special characters, corrupted text):")


def check_text_quality(text):
    """Count occurrences of the known corruption markers in ``text``.

    Each marker is matched literally (regex metacharacters are escaped) and
    its non-overlapping occurrences are counted; the return value is the
    total across all markers.
    """
    markers = ('%*@*%', '@**%%@', '###$$$%', '$%@$$@$', '#%$#%**')
    total = 0
    for marker in markers:
        # finditer yields the same non-overlapping matches findall would list.
        total += sum(1 for _ in re.finditer(re.escape(marker), text))
    return total

# Score every text in both splits with the marker counter.
for frame in (train_df, test_df):
    frame['corruption_score'] = frame['Text'].map(check_text_quality)

# Per split: number of affected texts and total marker hits.
for heading, scores in (
    ("Training data corruption stats:", train_df['corruption_score']),
    ("\nTest data corruption stats:", test_df['corruption_score']),
):
    flagged = scores.gt(0).sum()
    print(heading)
    print(f"Texts with potential corruption: {flagged}")
    print(f"Total corruption patterns found: {scores.sum()}")

# Show the first three flagged training texts (up to 300 characters each).
print("\nExamples of potentially corrupted text:")
corrupted_examples = train_df.loc[train_df['corruption_score'] > 0].head(3)
for subject, text in zip(corrupted_examples['Subject'], corrupted_examples['Text']):
    print(f"\nSubject: {subject}")
    print(f"Text: {text[:300]}...")

Looking for potential data issues (special characters, corrupted text):
Training data corruption stats:
Texts with potential corruption: 5
Total corruption patterns found: 5

Test data corruption stats:
Texts with potential corruption: 2
Total corruption patterns found: 2

Examples of potentially corrupted text:

Subject: Gaming
Text: *@**%%@ To support this, Blizzard released the hero reference kit before release, providing official colors and costume and weapon designs for all 21 heroes present at the game's launch...

Subject: Computer Sciences
Text: Pharo has an implementation of a heap in the Collections-Sequenceable package along with a set of test cases. A heap is used ###$$$% in the implementation of the timer event loop....

Subject: Geography
Text: The members of the Folsom $%@$$@$ party kept a journal- based on the information it reported, a party of Montana residents organized the Washburn–Langford–Doane Expedition in 1870...


In [None]:
import pandas as pd
import numpy as np

# Read the competition CSVs into fresh frames (rebinds train_df / test_df).
train_df, test_df, sample_sub = map(
    pd.read_csv, ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Dimensions of each frame.
for label, frame in (
    ("Training dataset shape:", train_df),
    ("Test dataset shape:", test_df),
    ("Sample submission shape:", sample_sub),
):
    print(label, frame.shape)

# Column names of each frame.
for label, frame in (
    ("\nTraining data columns:", train_df),
    ("Test data columns:", test_df),
    ("Sample submission columns:", sample_sub),
):
    print(label, list(frame.columns))

# First rows of each frame.
for heading, frame in (
    ("\nFirst few rows of training data:", train_df),
    ("\nFirst few rows of test data:", test_df),
    ("\nSample submission format:", sample_sub),
):
    print(heading)
    print(frame.head())

Training dataset shape: (10000, 3)
Test dataset shape: (4020, 2)
Sample submission shape: (5, 2)

Training data columns: ['ID', 'Text', 'Subject']
Test data columns: ['ID', 'Text']
Sample submission columns: ['ID', 'Subject']

First few rows of training data:
            ID                                               Text  \
0  train_00001  Its "duet" feature allows users to film a vide...   
1  train_00002  *@**%%@ To support this, Blizzard released the...   
2  train_00003  James Mitchell, the Premier of Western Austral...   
3  train_00004  Pharo has an implementation of a heap in the C...   
4  train_00005  a for Tour, Alberto his his possible allowed m...   

             Subject  
0        Pop Culture  
1             Gaming  
2            History  
3  Computer Sciences  
4             Sports  

First few rows of test data:
          ID                                               Text
0  test_0001  Square's decision to produce games %*@*%@@ exc...
1  test_0002  Many of the pro

In [None]:
# Target labels: distinct values, absolute counts, and percentage shares.
unique_subjects = train_df['Subject'].unique()
print("Unique subjects in training data:")
print(unique_subjects)

subject_counts = train_df['Subject'].value_counts()
print("\nSubject distribution:")
print(subject_counts)

print("\nSubject distribution percentages:")
print(train_df['Subject'].value_counts(normalize=True).mul(100))

# Null counts per column for both splits.
for heading, frame in (
    ("\nMissing values in training data:", train_df),
    ("\nMissing values in test data:", test_df),
):
    print(heading)
    print(frame.isnull().sum())

# Character length of each raw text, stored as a new column on train_df.
train_df['text_length'] = train_df['Text'].str.len()
print("\nText length statistics:")
print(train_df['text_length'].describe())

# Up to two example texts per subject, each cut to 150 characters.
print("\nSample texts from each subject:")
for subject in unique_subjects:
    print(f"\n--- {subject} ---")
    sample_texts = train_df.loc[train_df['Subject'] == subject, 'Text'].head(2).tolist()
    for number, text in enumerate(sample_texts, start=1):
        print(f"Example {number}: {text[:150]}...")

Unique subjects in training data:
['Pop Culture' 'Gaming' 'History' 'Computer Sciences' 'Sports'
 'Natural Sciences' 'Geography']

Subject distribution:
Subject
Sports               2210
Gaming               1640
Pop Culture          1566
Geography            1413
Natural Sciences     1389
Computer Sciences    1039
History               743
Name: count, dtype: int64

Subject distribution percentages:
Subject
Sports               22.10
Gaming               16.40
Pop Culture          15.66
Geography            14.13
Natural Sciences     13.89
Computer Sciences    10.39
History               7.43
Name: proportion, dtype: float64

Missing values in training data:
ID         0
Text       0
Subject    0
dtype: int64

Missing values in test data:
ID      0
Text    0
dtype: int64

Text length statistics:
count    10000.00000
mean       221.88560
std        288.31965
min         11.00000
25%        106.75000
50%        156.00000
75%        229.00000
max       5993.00000
Name: text_length, dtype

In [None]:
# Import required libraries for text preprocessing and modeling
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
import nltk

# Fetch each NLTK resource only when a local lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print("Libraries imported successfully")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Libraries imported successfully


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Text preprocessing function

# Characters that are neither word characters, whitespace, nor . , ! ? - ' "
_DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\!\?\-\'\"]')

# Ordered (pattern, expansion) pairs applied to already-lowercased text.
# Irregular whole-word forms come first so the generic "n't" rule cannot
# turn "can't" into "ca not". Suffix rules require a word character before
# the apostrophe and a word boundary after the suffix, so an opening quote
# such as 'dog' or a name such as o'donnell is left untouched.
_CONTRACTION_RULES = [
    (re.compile(r"\bdon't\b"), "do not"),
    (re.compile(r"\bwon't\b"), "will not"),
    (re.compile(r"\bcan't\b"), "cannot"),
    (re.compile(r"(?<=\w)n't\b"), " not"),
    (re.compile(r"(?<=\w)'re\b"), " are"),
    (re.compile(r"(?<=\w)'ve\b"), " have"),
    (re.compile(r"(?<=\w)'ll\b"), " will"),
    (re.compile(r"(?<=\w)'d\b"), " would"),
    (re.compile(r"(?<=\w)'m\b"), " am"),
]


def preprocess_text(text):
    """
    Normalise one raw text for vectorisation.

    Steps, in order:
    1. Return "" for missing values (anything ``pd.isna`` reports as NA).
    2. Coerce to ``str`` and lowercase.
    3. Map the typographic apostrophe (U+2019) to ASCII ' so contractions
       written with it are still recognised.
    4. Replace every character that is not a word character, whitespace,
       or one of . , ! ? - ' " with a space.
    5. Expand a small set of English contractions (see _CONTRACTION_RULES).
    6. Collapse runs of whitespace to single spaces and strip the ends.

    Args:
        text: the raw text, or a missing value.

    Returns:
        The cleaned, lowercased string.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase (str() guards against non-string cells)
    text = str(text).lower()

    # Normalise curly apostrophes before the character filter removes them
    text = text.replace("\u2019", "'")

    # Remove special characters that are not alphanumeric or basic punctuation
    text = _DISALLOWED_CHARS.sub(' ', text)

    # Expand contractions; anchored patterns avoid rewriting quoted words
    for pattern, expansion in _CONTRACTION_RULES:
        text = pattern.sub(expansion, text)

    # Remove extra whitespace
    return ' '.join(text.split())

# Add a cleaned 'processed_text' column to both splits.
for announcement, frame in (
    ("Preprocessing training data...", train_df),
    ("Preprocessing test data...", test_df),
):
    print(announcement)
    frame['processed_text'] = frame['Text'].map(preprocess_text)

# Compare the first three training texts before and after cleaning.
print("\nBefore and after preprocessing examples:")
for position in range(3):
    raw_text = train_df['Text'].iloc[position]
    cleaned_text = train_df['processed_text'].iloc[position]
    print(f"\nExample {position+1}:")
    print(f"Original: {raw_text[:100]}...")
    print(f"Processed: {cleaned_text[:100]}...")

print("Preprocessing completed!")

Preprocessing training data...
Preprocessing test data...

Before and after preprocessing examples:

Example 1:
Original: Its "duet" feature allows users to film a video aside another video...
Processed: its "duet" feature allows users to film a video aside another video...

Example 2:
Original: *@**%%@ To support this, Blizzard released the hero reference kit before release, providing official...
Processed: to support this, blizzard released the hero reference kit before release, providing official colors ...

Example 3:
Original: James Mitchell, the Premier of Western Australia lent his strong support to renewal of the military ...
Processed: james mitchell, the premier of western australia lent his strong support to renewal of the military ...
Preprocessing completed!


In [None]:
# Features are the cleaned texts; the target is the subject label.
X, y = train_df['processed_text'], train_df['Subject']

# Hold out 20% for validation, stratified by subject, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(test_df)}")

# Settings shared by the two uni+bigram vectorizers.
_bigram_settings = dict(
    max_features=15000, ngram_range=(1, 2), stop_words='english', max_df=0.95, min_df=2
)

# Candidate text vectorizers, keyed by a short name.
vectorizers = {
    'tfidf_basic': TfidfVectorizer(stop_words='english', max_features=10000),
    'tfidf_ngram': TfidfVectorizer(**_bigram_settings),
    'tfidf_advanced': TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 3),
        stop_words='english',
        max_df=0.9,
        min_df=3,
        sublinear_tf=True,
    ),
    'count_basic': CountVectorizer(stop_words='english', max_features=10000),
    'count_ngram': CountVectorizer(**_bigram_settings),
}

# Candidate classifiers, keyed by a short name.
models = {
    'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
    'naive_bayes': MultinomialNB(),
    'random_forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'svm': SVC(random_state=42, kernel='linear'),
}

print("Vectorizers and models prepared!")
print(f"Vectorizers: {list(vectorizers)}")
print(f"Models: {list(models)}")

Training set size: 8000
Validation set size: 2000
Test set size: 4020
Vectorizers and models prepared!
Vectorizers: ['tfidf_basic', 'tfidf_ngram', 'tfidf_advanced', 'count_basic', 'count_ngram']
Models: ['logistic_regression', 'naive_bayes', 'random_forest', 'svm']


In [None]:
# Function to evaluate model combinations
def evaluate_model(vectorizer_name, model_name, X_train, X_val, y_train, y_val, vectorizers, models):
    """Fit one vectorizer/model pair on the training split and score it on validation.

    The estimators stored in ``vectorizers`` and ``models`` are treated as
    unfitted templates: each call fits fresh clones. This stops a later call
    that reuses the same model name from silently refitting an object the
    caller kept from an earlier call.

    Args:
        vectorizer_name: key into ``vectorizers``.
        model_name: key into ``models``.
        X_train, X_val: texts passed to the vectorizer's ``fit_transform`` / ``transform``.
        y_train, y_val: labels for the two splits.
        vectorizers: mapping of name -> vectorizer template.
        models: mapping of name -> classifier template.

    Returns:
        ``(score, vectorizer, model)`` where ``score`` is 100 * macro F1 on
        the validation split and the other two are the fitted clones.

    Raises:
        KeyError: if either name is missing from its mapping.
    """
    # Local import keeps this function self-contained within the notebook.
    from sklearn.base import clone

    print(f"\nEvaluating {vectorizer_name} + {model_name}...")

    # Vectorize the text with a fresh copy; vocabulary is learned from the training split only
    vectorizer = clone(vectorizers[vectorizer_name])
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # Train a fresh copy of the model
    model = clone(models[model_name])
    model.fit(X_train_vec, y_train)

    # Make predictions
    y_pred = model.predict(X_val_vec)

    # Calculate macro F1 score (as mentioned in the problem)
    macro_f1 = f1_score(y_val, y_pred, average='macro')
    score = 100 * macro_f1

    print(f"Macro F1 Score: {macro_f1:.4f}")
    print(f"Evaluation Score: {score:.2f}")

    return score, vectorizer, model

# Test a few promising combinations first (to avoid long computation)
best_score = 0
best_combo = None
best_vectorizer = None
best_model = None

# Start with the most promising combinations
promising_combos = [
    ('tfidf_ngram', 'logistic_regression'),
    ('tfidf_advanced', 'logistic_regression'),
    ('tfidf_basic', 'logistic_regression'),
    ('tfidf_ngram', 'naive_bayes'),
    ('count_ngram', 'naive_bayes')
]

# (vectorizer name, model name, score) for every pair that ran to completion
results = []

for vec_name, model_name in promising_combos:
    try:
        score, vectorizer, model = evaluate_model(vec_name, model_name, X_train, X_val, y_train, y_val, vectorizers, models)
    except Exception as e:
        # Best-effort sweep: report the failure and move on to the next pair.
        print(f"Error with {vec_name} + {model_name}: {str(e)}")
        continue

    results.append((vec_name, model_name, score))

    # `best_combo is None` lets the first successful pair win even when its
    # score is exactly 0, so a "best" always exists if anything succeeded.
    # NOTE(review): best_vectorizer / best_model are whatever evaluate_model
    # returned; if it hands back shared instances from `models`, a later run
    # with the same model name refits them.
    if best_combo is None or score > best_score:
        best_score = score
        best_combo = (vec_name, model_name)
        best_vectorizer = vectorizer
        best_model = model

print("\n=== RESULTS ===")
for vec_name, model_name, score in sorted(results, key=lambda x: x[2], reverse=True):
    print(f"{vec_name} + {model_name}: {score:.2f}")

# Every pair may have failed; report that instead of indexing into None.
if best_combo is None:
    print("\nNo combination was evaluated successfully.")
else:
    print(f"\nBest combination: {best_combo[0]} + {best_combo[1]} with score: {best_score:.2f}")


Evaluating tfidf_ngram + logistic_regression...
Macro F1 Score: 0.8678
Evaluation Score: 86.78

Evaluating tfidf_advanced + logistic_regression...
Macro F1 Score: 0.8665
Evaluation Score: 86.65

Evaluating tfidf_basic + logistic_regression...
Macro F1 Score: 0.8655
Evaluation Score: 86.55

Evaluating tfidf_ngram + naive_bayes...
Macro F1 Score: 0.8503
Evaluation Score: 85.03

Evaluating count_ngram + naive_bayes...
Macro F1 Score: 0.8751
Evaluation Score: 87.51

=== RESULTS ===
count_ngram + naive_bayes: 87.51
tfidf_ngram + logistic_regression: 86.78
tfidf_advanced + logistic_regression: 86.65
tfidf_basic + logistic_regression: 86.55
tfidf_ngram + naive_bayes: 85.03

Best combination: count_ngram + naive_bayes with score: 87.51


In [None]:
# Let's get detailed results for the best model
from sklearn.metrics import confusion_matrix, f1_score

print("Detailed evaluation for best model: count_ngram + naive_bayes")

# Use the best combination
vectorizer = CountVectorizer(max_features=15000, ngram_range=(1, 2), stop_words='english', max_df=0.95, min_df=2)
model = MultinomialNB()

# Fit on training data
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
model.fit(X_train_vec, y_train)

# Get predictions
y_pred = model.predict(X_val_vec)

# One label order, taken from the fitted model, is used for both the
# confusion matrix and the per-class scores so rows and names always line up.
classes = model.classes_.tolist()

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix (rows = true class, columns = predicted class, in `classes` order)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_val, y_pred, labels=classes)
print(cm)

# Per-class F1 scores, in `classes` order
per_class_f1 = f1_score(y_val, y_pred, labels=classes, average=None)
print("\nPer-class F1 scores:")
for class_name, f1 in zip(classes, per_class_f1):
    print(f"{class_name}: {f1:.4f}")

# Compute macro F1 once and reuse it for the final score.
macro_f1 = f1_score(y_val, y_pred, average='macro')
print(f"\nMacro F1: {macro_f1:.4f}")
print(f"Weighted F1: {f1_score(y_val, y_pred, average='weighted'):.4f}")
print(f"Final Score: {100 * macro_f1:.2f}")

Detailed evaluation for best model: count_ngram + naive_bayes

Classification Report:
                   precision    recall  f1-score   support

Computer Sciences       0.86      0.94      0.90       208
           Gaming       0.86      0.89      0.88       328
        Geography       0.89      0.83      0.86       283
          History       0.89      0.89      0.89       148
 Natural Sciences       0.86      0.91      0.89       278
      Pop Culture       0.88      0.77      0.82       313
           Sports       0.89      0.91      0.90       442

         accuracy                           0.88      2000
        macro avg       0.88      0.88      0.88      2000
     weighted avg       0.88      0.88      0.87      2000


Confusion Matrix:
[[196   4   1   0   4   2   1]
 [  6 292   7   2   5   7   9]
 [  4   5 234   3  20  10   7]
 [  1   4   8 132   0   1   2]
 [  7   4   3   2 254   3   5]
 [ 12  21   4   4   5 240  27]
 [  3   8   7   5   6  11 402]]

Per-class F1 scores:
Com

In [None]:
# Let's also try an ensemble approach or hyperparameter tuning for better results
# First, let's try some hyperparameter tuning for the CountVectorizer + NB combination

from itertools import product

# Not used in this cell; the import is kept in case other cells rely on it.
from sklearn.model_selection import GridSearchCV

print("Performing hyperparameter tuning for CountVectorizer + MultinomialNB...")

# Define parameter grids
vectorizer_params = [
    {'max_features': [10000, 15000, 20000], 'min_df': [2, 3, 5], 'max_df': [0.9, 0.95, 0.98]},
]

nb_params = [
    {'alpha': [0.1, 0.5, 1.0, 2.0]}
]

best_tuned_score = 0
best_tuned_vectorizer = None
best_tuned_model = None
best_tuned_params = None  # hyperparameters of the best run, recorded for reporting/reuse

# Try different vectorizer parameters
for vec_params in vectorizer_params:
    # product() walks the grid in the same order as three nested loops would.
    for max_feat, min_df, max_df in product(
        vec_params['max_features'], vec_params['min_df'], vec_params['max_df']
    ):
        print(f"Testing max_features={max_feat}, min_df={min_df}, max_df={max_df}")

        vectorizer = CountVectorizer(max_features=max_feat, ngram_range=(1, 2),
                                   stop_words='english', max_df=max_df, min_df=min_df)

        # The vectorizer is fit once per setting and reused for every alpha.
        X_train_vec = vectorizer.fit_transform(X_train)
        X_val_vec = vectorizer.transform(X_val)

        # Try the alpha values declared in nb_params (single source of truth)
        for nb_grid in nb_params:
            for alpha in nb_grid['alpha']:
                model = MultinomialNB(alpha=alpha)
                model.fit(X_train_vec, y_train)

                y_pred = model.predict(X_val_vec)
                score = 100 * f1_score(y_val, y_pred, average='macro')

                if score > best_tuned_score:
                    best_tuned_score = score
                    best_tuned_vectorizer = vectorizer
                    best_tuned_model = model
                    best_tuned_params = {
                        'max_features': max_feat,
                        'min_df': min_df,
                        'max_df': max_df,
                        'alpha': alpha,
                    }
                    print(f"  New best score: {score:.2f} (alpha={alpha})")

print(f"\nBest tuned score: {best_tuned_score:.2f}")
print(f"Best tuned parameters: {best_tuned_params}")

# If tuning didn't improve much, let's stick with the original best model.
# `best_score` is the best validation score from the earlier model comparison;
# 0.5 points is the minimum gain treated as meaningful.
if best_tuned_params is None or best_tuned_score <= best_score + 0.5:
    print("Tuning didn't provide significant improvement. Using original best model.")
    final_vectorizer = CountVectorizer(max_features=15000, ngram_range=(1, 2), stop_words='english', max_df=0.95, min_df=2)
    final_model = MultinomialNB()
else:
    print("Using tuned model.")
    final_vectorizer = best_tuned_vectorizer
    final_model = best_tuned_model

Performing hyperparameter tuning for CountVectorizer + MultinomialNB...
Testing max_features=10000, min_df=2, max_df=0.9
  New best score: 87.38 (alpha=0.1)
  New best score: 87.42 (alpha=0.5)
Testing max_features=10000, min_df=2, max_df=0.95
Testing max_features=10000, min_df=2, max_df=0.98
Testing max_features=10000, min_df=3, max_df=0.9
Testing max_features=10000, min_df=3, max_df=0.95
Testing max_features=10000, min_df=3, max_df=0.98
Testing max_features=10000, min_df=5, max_df=0.9
Testing max_features=10000, min_df=5, max_df=0.95
Testing max_features=10000, min_df=5, max_df=0.98
Testing max_features=15000, min_df=2, max_df=0.9
  New best score: 88.27 (alpha=0.1)
Testing max_features=15000, min_df=2, max_df=0.95
Testing max_features=15000, min_df=2, max_df=0.98
Testing max_features=15000, min_df=3, max_df=0.9
Testing max_features=15000, min_df=3, max_df=0.95
Testing max_features=15000, min_df=3, max_df=0.98
Testing max_features=15000, min_df=5, max_df=0.9
Testing max_features=15000

In [None]:
# Fit the chosen configuration on all labelled data, then label the test set.

print("Training final model on entire training dataset...")

# Full training data: cleaned texts and their subjects.
X_full_train, y_full_train = train_df['processed_text'], train_df['Subject']

# Configuration: max_features=20000, min_df=2, max_df=0.9, alpha=0.5
final_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
    min_df=2,
    max_df=0.9,
)
X_full_train_vec = final_vectorizer.fit_transform(X_full_train)

final_model = MultinomialNB(alpha=0.5)
final_model.fit(X_full_train_vec, y_full_train)

print("Final model trained successfully!")

# Vectorize the test texts with the fitted vocabulary and predict.
print("Making predictions on test set...")
X_test_processed = test_df['processed_text']
X_test_vec = final_vectorizer.transform(X_test_processed)
test_predictions = final_model.predict(X_test_vec)

print("Predictions completed!")

# Two-column frame: test IDs alongside the predicted subject.
submission = test_df[['ID']].assign(Subject=test_predictions)

predicted_subjects = submission['Subject']
print(f"Submission shape: {submission.shape}")
print(f"Unique predictions: {predicted_subjects.unique()}")
print("Prediction distribution:")
print(predicted_subjects.value_counts())

# Show the first ten rows of the submission.
print("\nFirst 10 predictions:")
print(submission.head(10))

Training final model on entire training dataset...
Final model trained successfully!
Making predictions on test set...
Predictions completed!
Submission shape: (4020, 2)
Unique predictions: ['Gaming' 'Pop Culture' 'Computer Sciences' 'Geography' 'Natural Sciences'
 'History' 'Sports']
Prediction distribution:
Subject
Sports               909
Gaming               645
Pop Culture          593
Natural Sciences     552
Geography            541
Computer Sciences    516
History              264
Name: count, dtype: int64

First 10 predictions:
          ID            Subject
0  test_0001             Gaming
1  test_0002        Pop Culture
2  test_0003  Computer Sciences
3  test_0004          Geography
4  test_0005        Pop Culture
5  test_0006             Gaming
6  test_0007   Natural Sciences
7  test_0008            History
8  test_0009        Pop Culture
9  test_0010             Sports


In [None]:
# Let's validate our model once more using cross-validation on the full training set
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

print("Performing 5-fold cross-validation on full training set...")

# Use the final model configuration
final_vectorizer_cv = CountVectorizer(max_features=20000, ngram_range=(1, 2),
                                    stop_words='english', max_df=0.9, min_df=2)
final_model_cv = MultinomialNB(alpha=0.5)

# Vectorizer and classifier are cross-validated together on the raw
# (preprocessed) texts. Fitting the vectorizer on the full training set first
# would let vocabulary selection and min_df/max_df pruning see the held-out
# fold and inflate the estimate; inside a pipeline it is refit per fold.
cv_pipeline = make_pipeline(final_vectorizer_cv, final_model_cv)

# Perform cross-validation
cv_scores = cross_val_score(cv_pipeline, X_full_train, y_full_train,
                           cv=5, scoring='f1_macro')

print(f"Cross-validation F1 macro scores: {cv_scores}")
print(f"Mean CV F1 macro: {cv_scores.mean():.4f}")
print(f"Std CV F1 macro: {cv_scores.std():.4f}")
print(f"Mean CV Score (100 * F1): {100 * cv_scores.mean():.2f}")

# Save the predictions to CSV
submission.to_csv('prediction.csv', index=False)
print(f"\nPredictions saved to 'prediction.csv'")

# Verify the submission format
print(f"\nVerifying submission format:")
print(f"Shape: {submission.shape}")
print(f"Columns: {submission.columns.tolist()}")
print(f"Index matches test file IDs: {submission['ID'].equals(test_df['ID'])}")
print(f"All subjects are valid: {set(submission['Subject'].unique()).issubset(set(train_df['Subject'].unique()))}")

print("\n=== FINAL MODEL SUMMARY ===")
print(f"Vectorizer: CountVectorizer with max_features=20000, ngram_range=(1,2), max_df=0.9, min_df=2")
print(f"Model: MultinomialNB with alpha=0.5")
print(f"Expected performance (CV): {100 * cv_scores.mean():.2f} ± {100 * cv_scores.std():.2f}")
print(f"Submission file: prediction.csv ({submission.shape[0]} predictions)")

submission.head(10)

Performing 5-fold cross-validation on full training set...
Cross-validation F1 macro scores: [0.88215912 0.8876484  0.90033131 0.86713556 0.8694774 ]
Mean CV F1 macro: 0.8814
Std CV F1 macro: 0.0122
Mean CV Score (100 * F1): 88.14

Predictions saved to 'submission.csv'

Verifying submission format:
Shape: (4020, 2)
Columns: ['ID', 'Subject']
Index matches test file IDs: True
All subjects are valid: True

=== FINAL MODEL SUMMARY ===
Vectorizer: CountVectorizer with max_features=20000, ngram_range=(1,2), max_df=0.9, min_df=2
Model: MultinomialNB with alpha=0.5
Expected performance (CV): 88.14 ± 1.22
Submission file: submission.csv (4020 predictions)


Unnamed: 0,ID,Subject
0,test_0001,Gaming
1,test_0002,Pop Culture
2,test_0003,Computer Sciences
3,test_0004,Geography
4,test_0005,Pop Culture
5,test_0006,Gaming
6,test_0007,Natural Sciences
7,test_0008,History
8,test_0009,Pop Culture
9,test_0010,Sports


In [None]:
# Fix the cross-validation call and complete the submission
import os

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline

print("Performing 5-fold cross-validation on full training set...")

# Use the final model configuration
final_vectorizer_cv = CountVectorizer(max_features=20000, ngram_range=(1, 2),
                                    stop_words='english', max_df=0.9, min_df=2)
final_model_cv = MultinomialNB(alpha=0.5)

# Vectorizer and classifier are cross-validated together on the raw
# (preprocessed) texts. Fitting the vectorizer on the full training set first
# would let vocabulary selection and min_df/max_df pruning see the held-out
# fold and inflate the estimate; inside a pipeline it is refit per fold.
cv_pipeline = make_pipeline(final_vectorizer_cv, final_model_cv)

# Perform cross-validation with StratifiedKFold for reproducibility
# (shuffled folds with a fixed seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(cv_pipeline, X_full_train, y_full_train,
                           cv=skf, scoring='f1_macro')

print(f"Cross-validation F1 macro scores: {cv_scores}")
print(f"Mean CV F1 macro: {cv_scores.mean():.4f}")
print(f"Std CV F1 macro: {cv_scores.std():.4f}")
print(f"Mean CV Score (100 * F1): {100 * cv_scores.mean():.2f}")

# Save the predictions to CSV
submission.to_csv('prediction.csv', index=False)
print(f"\nPredictions saved to 'prediction.csv'")

# Verify the submission format
print(f"\nVerifying submission format:")
print(f"Shape: {submission.shape}")
print(f"Columns: {submission.columns.tolist()}")
print(f"Index matches test file IDs: {submission['ID'].equals(test_df['ID'])}")
print(f"All subjects are valid: {set(submission['Subject'].unique()).issubset(set(train_df['Subject'].unique()))}")

print("\n=== FINAL MODEL SUMMARY ===")
print(f"Vectorizer: CountVectorizer with max_features=20000, ngram_range=(1,2), max_df=0.9, min_df=2")
print(f"Model: MultinomialNB with alpha=0.5")
print(f"Expected performance (CV): {100 * cv_scores.mean():.2f} ± {100 * cv_scores.std():.2f}")
print(f"Submission file: prediction.csv ({submission.shape[0]} predictions)")

# Check if we have all required files (lists every CSV in the working directory)
files_in_dir = os.listdir('.')
print(f"\nFiles created: {[f for f in files_in_dir if f.endswith('.csv')]}")
print("prediction.csv ready for upload!")

Performing 5-fold cross-validation on full training set...
Cross-validation F1 macro scores: [0.87638371 0.88323856 0.8837605  0.87485903 0.88994307]
Mean CV F1 macro: 0.8816
Std CV F1 macro: 0.0055
Mean CV Score (100 * F1): 88.16

Predictions saved to 'submission.csv'

Verifying submission format:
Shape: (4020, 2)
Columns: ['ID', 'Subject']
Index matches test file IDs: True
All subjects are valid: True

=== FINAL MODEL SUMMARY ===
Vectorizer: CountVectorizer with max_features=20000, ngram_range=(1,2), max_df=0.9, min_df=2
Model: MultinomialNB with alpha=0.5
Expected performance (CV): 88.16 ± 0.55
Submission file: submission.csv (4020 predictions)

Files created: ['train.csv', 'submission.csv', 'sample_submission.csv', 'test.csv']
submission.csv ready for upload!
