<a href="https://colab.research.google.com/github/tommasomncttn/NLP-Disaster-Tweet-Detection/blob/main/LOGISTIC_REGRESSION_NB(SKLEARN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Importing necessary libraries
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Location of the cleaned dataset CSV.
# NOTE(review): this looks like a Colab-mounted Google Drive path — confirm the
# drive is mounted before this cell runs on a fresh kernel.
DATA_PATH = "/content/drive/MyDrive/ML_proj/clean_data.csv"

# Load the whole CSV into a DataFrame; later cells read its "text" and "target" columns.
df = pd.read_csv(DATA_PATH)

In [15]:
# Pull the raw text column and the target label column out of the loaded frame.
text_df, lable_df = df["text"], df["target"]

# Logistic regression classifier; max_iter is raised to 500 iterations.
model = LogisticRegression(max_iter=500)

# Bag-of-words encoder: turns each document into a row of token counts,
# dropping the built-in English stop-word list.
vect = CountVectorizer(stop_words="english")

# Learn the vocabulary from every document in text_df and build the sparse
# document-term matrix in one pass.
# NOTE(review): the vocabulary is learned from the full corpus before the
# cross-validation split in the next cell, so validation-fold tokens influence
# the feature space — consider fitting the vectorizer inside each fold.
vectorized_train = vect.fit_transform(text_df)

# Hyperparameter search (currently disabled)
# param_grid = {'C': np.logspace(-3, 3, 7), 'penalty': ['l2', None]}
# model_H = GridSearchCV(model,param_grid,cv=3)

In [17]:
# K-fold cross-validation of `model` on the vectorized corpus, reporting the
# F1 score per fold and the mean across folds.

# Number of folds to use for cross-validation
num_folds = 5

# Fixed seed so the shuffled fold assignment (and therefore the reported
# scores) is reproducible under Restart & Run All.
RANDOM_SEED = 42

# KFold splitter: shuffles the rows once, then partitions them into num_folds folds
kf = KFold(n_splits=num_folds, shuffle=True, random_state=RANDOM_SEED)

# F1 score obtained on the validation part of each fold
f1_scores = []

# Loop over each fold: train on the training part, evaluate on the validation part
for fold, (train_indices, val_indices) in enumerate(kf.split(vectorized_train, lable_df)):

    # kf.split yields positional row indices. The sparse matrix is indexed
    # positionally already; the label Series must use .iloc so the lookup does
    # not depend on the Series having a default 0..n-1 index.
    X_train, y_train = vectorized_train[train_indices], lable_df.iloc[train_indices]
    X_val, y_val = vectorized_train[val_indices], lable_df.iloc[val_indices]

    # Refit the logistic regression model on this fold's training data
    model.fit(X_train, y_train)

    # Predict labels for this fold's validation data
    predicted_labels = model.predict(X_val)

    # F1 score for this fold; scikit-learn's argument order is (y_true, y_pred)
    f1 = f1_score(y_val, predicted_labels)

    # Keep the score so the mean can be computed after the loop
    f1_scores.append(f1)

    # Print the F1 score for this fold
    print(f"Fold {fold}: F1 score = {f1}")

# Calculate the average F1 score across all folds
avg_score = np.mean(f1_scores)
print(f"Average F1 score across {num_folds} folds: {avg_score}")

Fold 0: F1 score = 0.7459086993970715
Fold 1: F1 score = 0.7750200160128103
Fold 2: F1 score = 0.7373572593800979
Fold 3: F1 score = 0.7436762225969645
Fold 4: F1 score = 0.7334963325183375
Average F1 score across 5 folds: 0.7470917059810563
