In [46]:
import os
import pandas as pd
import prompts as p


# Input and output locations, given relative to the current working directory.
# The shared prefixes are factored out so each path only spells out its own tail.
_DATASETS_DIR = "../../Datasets"
_MAMS_DOWNSAMPLED_DIR = _DATASETS_DIR + "/MAMS-ATSA/Downsampled"

TRAIN_PATH = _MAMS_DOWNSAMPLED_DIR + "/train/train_single_row.csv"
TEST_PATH = _MAMS_DOWNSAMPLED_DIR + "/test/test_single_row.csv"

# Destination of the CSV export performed in the last cell.
OUTPUT_PATH = _DATASETS_DIR + "/Evaluations/Sentiment Analysis/logistic_regression.csv"




In [47]:
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# Force the three columns used downstream to plain strings in both splits
# (train first, then test), assigning the converted column back in place.
for split_frame in (df_train, df_test):
    for column_name in ('text', 'term', 'polarity'):
        split_frame[column_name] = split_frame[column_name].astype(str)

# Row counts of each split, as a quick sanity check on the load.
print(len(df_train))
print(len(df_test))


11186
451


In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin

def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    Passed to ``TfidfVectorizer`` as its ``preprocessor`` callable.
    """
    lowered = text.lower()
    return lowered

class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Transformer that picks a single column out of a DataFrame.

    Parameters
    ----------
    column : label of the column that ``transform`` returns.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.column]``.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        # Reject anything that is not a DataFrame before indexing into it.
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a DataFrame")
        return X[self.column]

def _make_column_tfidf(column):
    """Build a sub-pipeline that selects ``column`` and TF-IDF vectorizes it."""
    steps = [
        ('extractor', ColumnExtractor(column=column)),
        ('vectorizer', TfidfVectorizer(preprocessor=preprocess_text)),
    ]
    return Pipeline(steps)


# One independent TF-IDF branch per input column.
text_preprocessor = _make_column_tfidf('text')
term_preprocessor = _make_column_tfidf('term')

# Both branches are combined into one feature matrix: text features, then term features.
feature_processing = FeatureUnion([
    ('text_features', text_preprocessor),
    ('term_features', term_preprocessor),
])

# L2-regularized logistic regression on top of the combined features.
classifier_step = LogisticRegression(random_state=42, C=1, penalty='l2', solver='saga')
pipeline = Pipeline([
    ('features', feature_processing),
    ('classifier', classifier_step),
])

def _print_evaluation(heading, y_true, y_pred):
    """Print a heading, the accuracy, and the per-class report for one split."""
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(classification_report(y_true, y_pred))


# Inputs are the sentence and the aspect term; the target is the polarity label.
input_columns = ['text', 'term']
X_train, y_train = df_train[input_columns], df_train['polarity']
X_test, y_test = df_test[input_columns], df_test['polarity']

pipeline.fit(X_train, y_train)

# Score on the data the model was fitted on, to compare against the test score.
y_train_pred = pipeline.predict(X_train)
_print_evaluation("Training Set Evaluation:", y_train, y_train_pred)

# Score on the test split and keep the predictions next to the gold labels.
y_test_pred = pipeline.predict(X_test)
df_test['polarity_pred'] = y_test_pred
_print_evaluation("Test Set Evaluation:", y_test, y_test_pred)




Training Set Evaluation:
Accuracy: 0.7855354907920615
              precision    recall  f1-score   support

    negative       0.79      0.71      0.75      2764
     neutral       0.80      0.87      0.83      5042
    positive       0.76      0.72      0.74      3380

    accuracy                           0.79     11186
   macro avg       0.78      0.77      0.77     11186
weighted avg       0.78      0.79      0.78     11186

Test Set Evaluation:
Accuracy: 0.6496674057649667
              precision    recall  f1-score   support

    negative       0.67      0.62      0.64       117
     neutral       0.67      0.80      0.73       194
    positive       0.59      0.47      0.53       140

    accuracy                           0.65       451
   macro avg       0.64      0.63      0.63       451
weighted avg       0.65      0.65      0.64       451



In [51]:
# Relate the text-column TF-IDF vocabulary to the fitted classifier's weights.
# Requires 'text_preprocessor' and 'pipeline' to have been fitted already.
vectorizer = text_preprocessor.named_steps['vectorizer']
classifier = pipeline.named_steps['classifier']
feature_names = vectorizer.get_feature_names_out()

# Only row 0 of coef_ is used, i.e. the weights for classifier.classes_[0];
# the other classes' weights are not part of the lookup built below.
coefficients = classifier.coef_[0]

# The classifier is fitted on the text features followed by the term features
# ('text_features' is listed first in feature_processing), so the weight vector
# is longer than the text vocabulary. Take the leading slice explicitly instead
# of relying on zip() silently dropping the surplus term weights.
assert len(coefficients) >= len(feature_names), (
    "classifier has fewer coefficients than the text vocabulary has tokens"
)
text_coefficients = coefficients[:len(feature_names)]

# Token -> weight lookup for the text vocabulary.
token_weights_dict = dict(zip(feature_names, text_coefficients))

# Function to map text to token weights
def map_text_to_token_weights(text):
    """Format the weight of every known token in ``text``.

    The text is split with the analyzer of the module-level ``vectorizer``.
    Tokens missing from ``token_weights_dict`` are skipped. The remaining
    ones are rendered as ``"token : weight"`` in order of appearance and
    joined with ``", "``.
    """
    analyze = vectorizer.build_analyzer()
    formatted = []
    for token in analyze(text):
        # Only tokens present in the lookup have a weight to report.
        if token in token_weights_dict:
            formatted.append(f"{token} : {token_weights_dict[token]}")
    return ', '.join(formatted)

# Attach the formatted per-token weights of each sentence as a new column.
df_test['token_weights'] = df_test['text'].apply(map_text_to_token_weights)

# Print the first rows (at most 20). head() selects by position, so this also
# works when the frame has fewer than 20 rows or an index that is not 0..n-1,
# where the label lookup df_test['text'][i] would raise KeyError.
preview_rows = df_test.head(20)
for sentence, weights in zip(preview_rows['text'], preview_rows['token_weights']):
    print(sentence)
    print(weights)



Amusing details distinguish desserts, from dulce de leche ice-cream profiteroles dotting a chocolate sauce tic-tac-toe board, to coconut custard surrounded by a sea of Malibu-rum gelee and poached pineapple.
details : 0.03391598521293542, distinguish : -0.09060113082679158, desserts : 0.19248726523508278, from : -0.5657247889759164, de : 0.17945842872799, leche : -0.017470373128488346, ice : -0.4998306600951552, cream : -0.12930417399686137, chocolate : -0.06244679606010882, sauce : 0.093990918074225, board : 0.06681904708332484, to : 0.06489666268372277, coconut : -0.36438710196307356, custard : -0.21438130042712866, surrounded : -0.27077686792325767, by : -0.38154381418577143, sea : 0.24051716214945149, of : -0.6721133239838706, malibu : 0.05987580658035815, rum : -0.1288084155438607, and : -1.4714550858778395, poached : 0.08233233182375482, pineapple : -0.27647806259917757
Amusing details distinguish desserts, from dulce de leche ice-cream profiteroles dotting a chocolate sauce tic-

In [52]:
# Plain-text preview of the first rows, including the columns added above.
first_rows = df_test.head()
print(first_rows)

   text_id  term_id                                               text  \
0        0        0  Amusing details distinguish desserts, from dul...   
1        0        1  Amusing details distinguish desserts, from dul...   
2        0        2  Amusing details distinguish desserts, from dul...   
3        0        3  Amusing details distinguish desserts, from dul...   
4        1        4  The server was so busy the night we visited th...   

                          term  polarity polarity_pred  \
0                     desserts  positive       neutral   
1     dulce de leche ice-cream   neutral       neutral   
2  chocolate sauce tic-tac-toe   neutral      positive   
3            poached pineapple   neutral       neutral   
4                       server  negative      negative   

                                       token_weights  
0  details : 0.03391598521293542, distinguish : -...  
1  details : 0.03391598521293542, distinguish : -...  
2  details : 0.03391598521293542, disting

In [32]:
# Persist the test frame, with its row index, to the evaluation CSV.
df_test.to_csv(path_or_buf=OUTPUT_PATH, index=True)