In [6]:
import os
import warnings
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import logging
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level logger; the root logger is configured to emit WARNING and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

# Function to evaluate metrics
def eval_metrics(actual, pred):
    """
    Scores a set of predictions against the true labels.

    Parameters:
    - actual: The ground-truth labels.
    - pred: The predicted labels, aligned with ``actual``.

    Returns:
    - tuple: ``(accuracy, precision, recall, f1)``. Precision, recall and F1
      are computed with ``average='weighted'``.
    """
    overall_accuracy = accuracy_score(actual, pred)

    # Precision, recall and F1 all use the same weighted averaging mode.
    weighted_scores = [
        scorer(actual, pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    ]

    return (overall_accuracy, *weighted_scores)

# Function for text preprocessing

# Download the NLTK resources used by preprocess_text if not already present:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' is the tokenizer model loaded by word_tokenize on older NLTK releases
#   - 'punkt_tab' is the tokenizer data newer NLTK releases load instead of 'punkt';
#     without it word_tokenize raises LookupError there
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Lazily-built cache of the English stop words, so the corpus file is read
# once per process instead of once per preprocessed row.
_STOP_WORDS = None


def _english_stop_words():
    """Returns the NLTK English stop-word set, loading it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """
    Preprocesses the text by normalizing, removing special characters, punctuation,
    stop words, and HTML tags.

    Parameters:
    - text (str): The input text to preprocess. ``None`` and NaN (what pandas
      supplies for missing cells) are treated as empty text; any other
      non-string value is converted with ``str()`` first.

    Returns:
    - str: The preprocessed text: lowercase tokens separated by single spaces
      ('' when nothing is left).
    """
    # 0. Missing values carry no text; calling .lower() on them would raise
    #    AttributeError. `text != text` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 3. Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # 4. Remove special characters and punctuation
    # NOTE(review): this also strips apostrophes, so contractions such as
    # "don't" become "dont" before stop-word filtering and presumably no
    # longer match the stop-word list -- confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Tokenize the text
    tokens = word_tokenize(text)

    # 7. Remove stop words
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # 8. Rejoin tokens into a single string
    return ' '.join(tokens)

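# Earlier placeholder version of preprocess_text (it returned the text unchanged).
# Superseded by the implementation above; kept only for reference.
# def preprocess_text(text):
#     # Add your text preprocessing steps here (normalization, removing special characters, etc.)
#     return text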

if __name__ == "__main__":
    # Script entry point: trains a TF-IDF + logistic-regression sentiment
    # classifier on the Twitter dataset and records the run in MLflow.
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Set the tracking URI for MLflow (remote server).
    # This must happen before set_experiment(): the experiment is looked up or
    # created in whichever store is active at that moment, so selecting it
    # first would target the default store instead of the remote server.
    remote_server_uri = "http://ec2-18-234-243-105.compute-1.amazonaws.com:5000/"
    mlflow.set_tracking_uri(remote_server_uri)

    # Set the experiment name
    mlflow.set_experiment("NLP_Twitter_experiment")

    # Load the Twitter sentiment dataset
    data = pd.read_csv("twitter_dataset.csv")  # Update with your dataset path
    data['text'] = data['text'].apply(preprocess_text)

    # Encode the labels (assuming they are in 'airline_sentiment' column)
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    data['label'] = data['airline_sentiment'].map(label_mapping)

    # Any sentiment value outside the mapping becomes NaN; fail here with a
    # clear message rather than later inside model.fit().
    unmapped_rows = data['label'].isna()
    if unmapped_rows.any():
        unexpected = sorted(data.loc[unmapped_rows, 'airline_sentiment'].astype(str).unique())
        raise ValueError(
            f"Unexpected or missing values in 'airline_sentiment': {unexpected}"
        )

    # Split the data into training and test sets (0.75, 0.25)
    train, test = train_test_split(data, test_size=0.25, random_state=42)

    train_x = train['text']  # Features
    train_y = train['label']  # Labels
    test_x = test['text']
    test_y = test['label']

    # Convert text to numerical representation (e.g., using TfidfVectorizer).
    # The vectorizer is fitted on the training split only and then reused on
    # the test split.
    vectorizer = TfidfVectorizer()
    train_x_vectorized = vectorizer.fit_transform(train_x)
    test_x_vectorized = vectorizer.transform(test_x)

    # Start MLflow run
    with mlflow.start_run(run_name="Logistic Regression"):
        model = LogisticRegression(max_iter=1000)
        model.fit(train_x_vectorized, train_y)

        # Predictions
        predicted_labels = model.predict(test_x_vectorized)
        accuracy, precision, recall, f1 = eval_metrics(test_y, predicted_labels)

        # Log parameters and metrics
        mlflow.log_param("model_type", "Logistic Regression")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # Log the model
        # NOTE(review): only the classifier is logged; the fitted vectorizer is
        # not, so consumers of the logged model must supply TF-IDF features
        # themselves -- confirm this is intended.
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(model, "model", registered_model_name="TwitterSentimentModel")
            print("Model logged and registered successfully!")
        else:
            mlflow.sklearn.log_model(model, "model")
            print("Model logged successfully (not registered: file store has no model registry).")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\acer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024/09/19 00:29:13 INFO mlflow.tracking.fluent: Experiment with name 'NLP_Twitter_experiment' does not exist. Creating a new experiment.
Registered model 'TwitterSentimentModel' already exists. Creating a new version of this model...
2024/09/19 00:29:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TwitterSentimentModel, version 2
Created version '2' of model 'TwitterSentimentModel'.


Model logged and registered successfully!


2024/09/19 00:29:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://ec2-18-234-243-105.compute-1.amazonaws.com:5000/#/experiments/389111837301992168/runs/359671704bb1493c826973cd7ea3ae33.
2024/09/19 00:29:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ec2-18-234-243-105.compute-1.amazonaws.com:5000/#/experiments/389111837301992168.
