In [38]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [39]:
# Read the training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('news_train.csv', 'test.csv')
)

In [40]:
# Create a separate text column containing both features.
# Each source column is NaN-filled *before* concatenation: `str + NaN`
# evaluates to NaN, so a row missing only its title (or only its headline)
# would otherwise lose the field that is present as well.
train_df['text'] = train_df['News_title'].fillna('') + ' ' + train_df['News_headline'].fillna('')
test_df['text'] = test_df['News_title'].fillna('') + ' ' + test_df['News_headline'].fillna('')

In [43]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared NLP resources, built once and reused by preprocess_text:
# the English stop-word list (as a set) and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a raw text value into a space-joined string of lemmatized tokens.

    Non-string inputs yield an empty string. Otherwise the text is lower-cased,
    every character that is not an ASCII letter or whitespace is deleted, the
    result is split on whitespace, tokens found in ``stop_words`` are dropped
    and the remaining tokens are passed through ``lemmatizer.lemmatize``.

    Args:
        text: The value to clean; anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if isinstance(text, str):
        # Lower-case first, then strip everything except letters/whitespace
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        kept_tokens = []
        for token in letters_only.split():
            if token in stop_words:
                continue
            kept_tokens.append(lemmatizer.lemmatize(token))
        return ' '.join(kept_tokens)
    return ""

# Replace missing values in the combined text column with an empty string
for text_frame in (train_df, test_df):
    text_frame['text'] = text_frame['text'].fillna("")

In [44]:
# Run preprocess_text over the text column of both the train and test datasets.
for dataset in (train_df, test_df):
    dataset['text'] = dataset['text'].apply(preprocess_text)

In [45]:
# Hold out 20% of the training data for validation, stratified by category
# so both splits keep the same class proportions; seeded for repeatability.
X, y = train_df['text'], train_df['Category']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [46]:
# TF-IDF vectorizer over unigrams and bigrams, capped at 5000 features,
# using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

In [47]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the validation split and the test set.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_val, test_df['text'])
)

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the classifiers, keyed by the display name used in the grid search
# and validation output below.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    # Random forests are stochastic (bootstrap sampling and feature
    # sub-sampling); seed the estimator so the grid search and validation
    # numbers are reproducible. 42 matches the seed used for the data split.
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Hyperparameter grids for the grid search, keyed by the same display names
# as `models`; each value maps a parameter name to its candidate values.
param_grid = {
    'Logistic Regression': dict(
        C=[0.01, 0.1, 1, 10, 100],
        solver=['liblinear', 'lbfgs'],
    ),
    'Naive Bayes': dict(
        alpha=[0.01, 0.1, 1, 10],
    ),
    'Support Vector Machine': dict(
        C=[0.01, 0.1, 1, 10],
        kernel=['linear', 'rbf'],
    ),
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
    ),
}

# Tune every classifier with 3-fold cross-validated grid search (accuracy
# scoring, all CPU cores) and keep the refit best estimator for each one.
best_models = {}
for model_name, model in models.items():
    print(f"Running grid search for {model_name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[model_name],
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
    ).fit(X_train_tfidf, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

# Print a classification report on the validation split for each tuned model
for model_name in best_models:
    model = best_models[model_name]
    y_val_pred = model.predict(X_val_tfidf)
    print(
        f"Validation report for {model_name}:",
        classification_report(y_val, y_val_pred),
        sep="\n",
    )

Running grid search for Logistic Regression...
Best parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Running grid search for Naive Bayes...
Best parameters for Naive Bayes: {'alpha': 0.1}
Running grid search for Support Vector Machine...
Best parameters for Support Vector Machine: {'C': 1, 'kernel': 'linear'}
Running grid search for Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'n_estimators': 200}
Validation report for Logistic Regression:
              precision    recall  f1-score   support

        Arts       0.71      0.57      0.63       168
    business       0.69      0.51      0.58       130
      humour       0.53      0.24      0.33       188
    politics       0.88      0.97      0.92      2304
      sports       0.90      0.82      0.85       180
        tech       0.87      0.66      0.75       146

    accuracy                           0.86      3116
   macro avg       0.76      0.63      0.68      3116
weighted avg       0.

In [59]:
# LogisticRegression(C=10, solver='liblinear') were the best logistic-regression
# parameters found by GridSearchCV, so rebuild that model here.
# NOTE: it is fitted on the training split (X_train_tfidf) only, not on the
# full dataset, so the validation split remains unseen by the model.
final_model = LogisticRegression(
    solver='liblinear',
    C=10,
)
final_model.fit(X_train_tfidf, y_train)

In [62]:
# Predict categories for the held-out validation split with the final model,
# so its results can be inspected. This rebinds y_val_pred, which the
# model-comparison loop above also assigned.
y_val_pred = final_model.predict(X_val_tfidf)

In [63]:
# Integer code assigned to each category label (used for the submission file).
category_mapping = {
    'Arts': 0,
    'business': 1,
    'humour': 2,
    'politics': 3,
    'sports': 4,
    'tech': 5
}

# Pass `labels` explicitly so every display name is tied to its own label.
# With `target_names` alone, names are matched by position against the sorted
# unique labels, which would silently mislabel report rows if the dict order
# ever differed from that sort order.
class_labels = list(category_mapping.keys())

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, labels=class_labels, target_names=class_labels))

Validation Accuracy: 0.8597560975609756
              precision    recall  f1-score   support

        Arts       0.71      0.57      0.63       168
    business       0.69      0.51      0.58       130
      humour       0.53      0.24      0.33       188
    politics       0.88      0.97      0.92      2304
      sports       0.90      0.82      0.85       180
        tech       0.87      0.66      0.75       146

    accuracy                           0.86      3116
   macro avg       0.76      0.63      0.68      3116
weighted avg       0.84      0.86      0.85      3116



In [64]:
# Predict categories for the test dataset, using the TF-IDF matrix that was
# built from test_df['text'] with the vectorizer fitted on the training split.
test_pred = final_model.predict(X_test_tfidf)

In [65]:
# Store in format for Kaggle competition: one row per test ID with the
# predicted category converted to its integer code.
submission = pd.DataFrame({'ID': test_df['ID'], 'Category': test_pred})

# Series.map yields NaN for any label that is missing from the mapping; fail
# loudly here instead of silently writing NaN categories to the submission.
unknown_labels = set(submission['Category']) - set(category_mapping)
if unknown_labels:
    raise ValueError(
        f"Predicted labels missing from category_mapping: {sorted(unknown_labels, key=str)}"
    )

submission['Category'] = submission['Category'].map(category_mapping)
# Preview a small slice of rows as a sanity check
submission[40:50]

Unnamed: 0,ID,Category
40,41,3
41,42,3
42,43,3
43,44,3
44,45,3
45,46,3
46,47,0
47,48,3
48,49,5
49,50,3


In [66]:
# Write the submission DataFrame to 'submission.csv' in the working directory.
# index=False leaves the DataFrame index out of the file, so only the
# 'ID' and 'Category' columns are written.
submission.to_csv('submission.csv', index=False)