<a href="https://colab.research.google.com/github/yasminela/AI-ML/blob/main/Tokenize_and_lemmatize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.exceptions import NotFittedError
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import joblib
import logging

# Configure the root logger once at import time: INFO level, with a
# timestamp and level name in front of every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download the NLTK resources used below. This has to run before the
# constants are built, because stopwords.words() reads the downloaded corpus.
# 'wordnet' is presumably the data WordNetLemmatizer relies on - confirm
# against the installed NLTK version.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level constants shared by preprocess_text().
# The English stopword list is stored as a set so that the per-token
# membership test is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words('english'))
# A single lemmatizer instance is created here and reused for every token.
LEMMATIZER = WordNetLemmatizer()


def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Location of the training CSV file.
        test_path: Location of the test CSV file.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        FileNotFoundError: If either file does not exist. The error is
            logged and then re-raised unchanged.
    """
    try:
        # The training file is read first, then the test file.
        frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
        logging.info("Datasets loaded successfully.")
        return frames[0], frames[1]
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        raise


# Patterns and the punctuation table are built once at import time instead of
# on every call (this function runs once per row of the dataset).
# The dot after "www" is escaped so that only a literal "www." prefix is
# treated as a URL; an unescaped dot matches any character and would also
# delete ordinary tokens that merely contain "www" (e.g. "awwwesome").
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')
_DIGIT_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a raw text string for vectorisation.

    Steps, in order: lowercase, strip URLs, strip digit runs, strip ASCII
    punctuation, split on whitespace, drop tokens found in ``STOPWORDS``,
    and lemmatize the remaining tokens with ``LEMMATIZER``.

    Args:
        text: The raw text. Any non-string value (e.g. NaN) is treated as
            missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase first so the URL pattern and the stopword lookup are
    # effectively case-insensitive.
    text = text.lower()

    # URLs are removed before punctuation; otherwise "http://a.b" would be
    # mangled into an ordinary-looking token and survive.
    text = _URL_PATTERN.sub('', text)
    text = _DIGIT_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)

    # NOTE(review): apostrophes are stripped before the stopword check, so a
    # contraction such as "don't" is looked up as "dont" - confirm whether
    # STOPWORDS should be normalised the same way.
    words = [
        LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in STOPWORDS
    ]

    return ' '.join(words)


def prepare_data(df):
    """Add the combined and cleaned text columns to ``df``.

    The ``title`` and ``content`` columns have missing values replaced by
    empty strings and are cast to ``str``. They are then joined with a single
    space into ``text``, and ``text_processed`` holds the result of running
    ``preprocess_text`` on each ``text`` value.

    Args:
        df: DataFrame with ``title`` and ``content`` columns. It is modified
            in place.

    Returns:
        The same DataFrame object, with ``text`` and ``text_processed`` added.
    """
    for column in ('title', 'content'):
        df[column] = df[column].fillna('').astype(str)

    combined = df['title'] + ' ' + df['content']
    df['text'] = combined
    df['text_processed'] = combined.apply(preprocess_text)
    return df


def encode_labels(y, label_encoder=None):
    """Convert target labels to encoded values.

    Args:
        y: The labels to encode.
        label_encoder: An existing encoder to reuse. When ``None``, a new
            ``LabelEncoder`` is created and fitted on ``y``.

    Returns:
        A ``(y_encoded, label_encoder)`` tuple: the encoded labels and the
        encoder that produced them.

    Raises:
        ValueError: Propagated from ``label_encoder.transform`` after being
            logged.
    """
    # No encoder supplied: fit a fresh one on the given labels.
    if label_encoder is None:
        fresh_encoder = LabelEncoder()
        return fresh_encoder.fit_transform(y), fresh_encoder

    # Encoder supplied: reuse its existing mapping without refitting.
    try:
        codes = label_encoder.transform(y)
    except ValueError:
        logging.error("Label encoder not fitted or unknown labels encountered.")
        raise
    return codes, label_encoder


def build_pipeline():
    """Build the text-classification pipeline.

    Returns:
        An unfitted ``Pipeline`` with two steps:

        * ``'tfidf'`` - ``TfidfVectorizer`` limited to 5000 features over
          unigrams and bigrams.
        * ``'clf'`` - ``LogisticRegression`` with the ``lbfgs`` solver and
          ``max_iter=1000``.

        The step names are the prefixes used for the parameter grid in
        ``tune_hyperparameters``.
    """
    # ``multi_class`` is intentionally not passed: 'auto' was already the
    # default, and the parameter is deprecated in recent scikit-learn
    # releases, where passing it emits a FutureWarning.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('clf', LogisticRegression(max_iter=1000, solver='lbfgs')),
    ])
    return pipeline


def evaluate_model(model, X_val, y_val, label_encoder):
    """Score a fitted model on held-out data and log the results.

    Args:
        model: Object exposing ``predict``.
        X_val: Validation inputs passed to ``model.predict``.
        y_val: True labels for ``X_val``.
        label_encoder: Encoder whose ``classes_`` supply the row names of the
            classification report.

    Returns:
        An ``(accuracy, report)`` tuple, where ``report`` is the dictionary
        returned by ``classification_report(..., output_dict=True)``.

    Raises:
        NotFittedError: Logged and re-raised.
    """
    try:
        predictions = model.predict(X_val)
        accuracy = accuracy_score(y_val, predictions)
        # NOTE(review): target_names is passed without ``labels``; if a class
        # is absent from both y_val and the predictions, the names may not
        # line up with the rows - confirm against the scikit-learn docs.
        report = classification_report(
            y_val,
            predictions,
            target_names=label_encoder.classes_,
            output_dict=True,
        )
        logging.info(f"Validation Accuracy: {accuracy:.4f}")
        # Transposed so each class/average is a row and each metric a column.
        logging.info("Classification Report:\n" + pd.DataFrame(report).T.to_string())
        return accuracy, report
    except NotFittedError:
        logging.error("Model is not fitted yet.")
        raise


def tune_hyperparameters(pipeline, X_train, y_train):
    """Grid-search the pipeline's hyperparameters and return the best model.

    The search covers the TF-IDF vocabulary size and n-gram range together
    with the classifier's ``C`` and solver. Candidates are scored by accuracy
    over 5 stratified folds, with ``n_jobs=-1``.

    Args:
        pipeline: Pipeline with steps named ``tfidf`` and ``clf``.
        X_train: Training inputs.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Keys use the ``<step>__<param>`` form to address pipeline steps.
    search_space = dict(
        tfidf__max_features=[5000, 10000],
        tfidf__ngram_range=[(1, 1), (1, 2)],
        clf__C=[0.1, 1, 10],
        clf__solver=['lbfgs', 'saga'],
    )
    folds = StratifiedKFold(n_splits=5)
    accuracy_scorer = make_scorer(accuracy_score)

    search = GridSearchCV(
        pipeline,
        search_space,
        cv=folds,
        scoring=accuracy_scorer,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    logging.info(f"Best Parameters: {search.best_params_}")
    return search.best_estimator_


def save_model(model, label_encoder, model_path="model.joblib", encoder_path="label_encoder.joblib"):
    """Persist the trained model and its label encoder with joblib.

    Args:
        model: The trained model to write to ``model_path``.
        label_encoder: The label encoder to write to ``encoder_path``.
        model_path: Destination file for the model.
        encoder_path: Destination file for the label encoder.
    """
    # The model is written first, then the encoder.
    artifacts = ((model, model_path), (label_encoder, encoder_path))
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)
    logging.info("Model and label encoder saved successfully.")


def main(train_path='/content/train.csv', test_path='/content/test.csv',
         submission_path='submission.csv'):
    """Run the full workflow: load, preprocess, tune, evaluate, predict, save.

    The input and output locations are parameters so the workflow can run
    outside the environment the default paths were written for. Calling
    ``main()`` with no arguments behaves exactly as before.

    Args:
        train_path: CSV with ``title``, ``content`` and ``target`` columns.
        test_path: CSV with ``id``, ``title`` and ``content`` columns.
        submission_path: Where the ``id``/``target`` predictions are written.

    Raises:
        FileNotFoundError: If ``train_path`` or ``test_path`` does not exist.
    """
    # Load datasets
    train_df, test_df = load_data(train_path, test_path)

    # Prepare data
    train_df = prepare_data(train_df)
    test_df = prepare_data(test_df)

    # Encode target labels
    y_train, label_encoder = encode_labels(train_df['target'])
    X_train = train_df['text_processed']

    # Hold out 10% for validation; stratify so the class proportions are kept
    # in both parts, and fix the seed so the split is reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
    )

    # Build and tune the pipeline
    pipeline = build_pipeline()
    tuned_pipeline = tune_hyperparameters(pipeline, X_train, y_train)

    # Evaluate the model on the held-out validation split
    evaluate_model(tuned_pipeline, X_val, y_val, label_encoder)

    # Predict on test data and map the encoded predictions back to the
    # original label values
    X_test = test_df['text_processed']
    test_predictions = tuned_pipeline.predict(X_test)
    predicted_categories = label_encoder.inverse_transform(test_predictions)

    # Create submission file, ordered by id
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'target': predicted_categories
    })
    submission_df = submission_df.sort_values(by='id').reset_index(drop=True)
    submission_df.to_csv(submission_path, index=False)
    logging.info("Submission file saved successfully.")

    # Save the model and label encoder
    save_model(tuned_pipeline, label_encoder)


# Run the full workflow only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
ERROR:root:File not found: [Errno 2] No such file or directory: '/content/train.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'