In [None]:
# sentiment_model_custom_columns.py

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Update paths to your dataset files
TRAINING_FILE = "/content/drive/MyDrive/Colab Notebooks/twitter_training.csv"
VALIDATION_FILE = "/content/drive/MyDrive/Colab Notebooks/twitter_validation.csv"

# Names assigned to the four columns of the header-less CSV files.
COLUMN_NAMES = ['ID', 'Platform', 'Sentiment', 'Text']


def _load_dataset(path):
    """
    Reads a header-less CSV file, names its columns and drops rows without text.

    :param path: Path of the CSV file to read.
    :return: DataFrame with the columns 'ID', 'Platform', 'Sentiment' and 'Text'.
    """
    df = pd.read_csv(path, header=None)
    # Assign meaningful column names
    df.columns = list(COLUMN_NAMES)

    # Drop rows with missing text
    df = df.dropna(subset=['Text'])
    print("First few rows after loading and cleaning:")
    print(df.head())
    return df


def preprocess_and_train(train_df):
    """
    Preprocesses the text data and trains a logistic regression model.

    Rows whose 'Text' is missing, empty or whitespace-only are discarded and the
    remaining text is converted to ``str`` before vectorizing. The filtering is
    done on a copy, so the DataFrame passed in by the caller is never modified.

    :param train_df: DataFrame containing 'Text' and 'Sentiment' columns for training.
    :return: Trained classifier and vectorizer.
    :raises ValueError: If no usable rows remain after preprocessing.
    """
    train_df.info()

    # Handle missing or non-string values: a missing entry compares unequal to ""
    # and would otherwise slip through the whitespace check below, so it is
    # excluded explicitly with notna().
    text = train_df['Text']
    usable = text.notna() & (text.astype(str).str.strip() != "")

    # Work on an explicit copy; assigning into a filtered slice of the caller's
    # frame is what triggers pandas' SettingWithCopyWarning.
    train_df = train_df.loc[usable].copy()

    if train_df.empty:
        raise ValueError("Training data is empty after preprocessing. Check your dataset.")

    # The vectorizer only accepts strings, so stringify any remaining non-string values.
    train_df['Text'] = train_df['Text'].astype(str)

    vectorizer = TfidfVectorizer(max_features=5000, stop_words=None)  # Adjust stop_words=None to allow all words
    X_train = vectorizer.fit_transform(train_df['Text'])
    y_train = train_df['Sentiment']

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)

    return classifier, vectorizer


def main():
    """
    Trains the sentiment model on the training file and reports validation metrics.

    Prints a classification report and the accuracy score for the validation file.
    """
    train_df = _load_dataset(TRAINING_FILE)
    val_df = _load_dataset(VALIDATION_FILE)

    classifier, vectorizer = preprocess_and_train(train_df)
    # Rows without text were already dropped on load; cast so that any
    # non-string value is still a valid document for the vectorizer.
    X_val = vectorizer.transform(val_df['Text'].astype(str))
    y_val = val_df['Sentiment']

    predictions = classifier.predict(X_val)
    print("\nClassification Report:\n", classification_report(y_val, predictions))
    print("Accuracy Score:", accuracy_score(y_val, predictions))


# The entry point comes last so every function it uses is already defined
# when the script is executed from top to bottom.
if __name__ == "__main__":
    main()


First few rows after loading and cleaning:
     ID     Platform Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                Text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
First few rows after loading and cleaning:
     ID   Platform   Sentiment  \
0  3364   Facebook  Irrelevant   
1   352     Amazon     Neutral   
2  8312  Microsoft    Negative   
3  4371      CS-GO    Negative   
4  4433     Google     Neutral   

                                                Text  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Micro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Text'] = train_df['Text'].fillna("").astype(str)



Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.82      0.73      0.77       172
    Negative       0.79      0.88      0.83       266
     Neutral       0.86      0.77      0.81       285
    Positive       0.80      0.86      0.83       277

    accuracy                           0.82      1000
   macro avg       0.82      0.81      0.81      1000
weighted avg       0.82      0.82      0.82      1000

Accuracy Score: 0.816
