In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np
from datasets import Dataset
import pandas as pd
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:


model_name = "microsoft/Phi-3-mini-4k-instruct"

# Load the model without custom quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                          max_seq_length=max_seq_length,
                                         )
tokenizer.pad_token = tokenizer.eos_token


In [8]:
filename = "synData.csv"

df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
sentiments=df.sentiment.unique()
sentiment_string="\n ".join(sentiments)
for sentiment in sentiments:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=0.8,
                                    test_size=0.2, 
                                    stratify=df[df.sentiment==sentiment].sentiment,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

def generate_prompt(data_point):
    return f"""The sentiment of the following phrase: '{data_point["text"]}' is \n{sentiment_string}\nCannot be determined
            \n\nSolution: The correct option is {data_point["sentiment"]}""".strip()

def generate_test_prompt(data_point):
    return f"""The sentiment of the following phrase: '{data_point["text"]}' is \n{sentiment_string}\nCannot be determined
            \n\nSolution: The correct option is""".strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

  .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))


In [9]:
for index,row in X_train.iterrows():
    print(row["text"])
    break

The sentiment of the following phrase: '"MAKE MONEY FAST!!! I just made $10,000 in one week using this one simple trick! Check out the link in my profile to learn how you can do it too!!"' is 
positive
 spam
 neutral
 misleading
Cannot be determined
            

Solution: The correct option is spam


In [10]:
sentiments
sorted_sentiments = sorted(sentiments)

mapping = {sentiment: index for index, sentiment in enumerate(sorted_sentiments)}
print(mapping)

{'misleading': 0, 'neutral': 1, 'positive': 2, 'spam': 3}


In [11]:
list(mapping.values())

[0, 1, 2, 3]

In [12]:
def evaluate(y_true, y_pred,sentiments):
    sorted_sentiments = sorted(sentiments)

    labels = sorted_sentiments
    mapping = {sentiment: index for index, sentiment in enumerate(sorted_sentiments)}

    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=list(mapping.values()))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [19]:
def predict(X_test, model, tokenizer, sentiments):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer,
                        max_new_tokens = 3, 
                        temperature = 0.0,
                       )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].split("The correct option is")[-1].lower()
        selection=False
        for sentiment in sentiments:
            if sentiment in answer:
                y_pred.append(sentiment)
                selection=True
                break
        # if "positive" in answer:
        #     y_pred.append("positive")
        # elif "negative" in answer:
        #     y_pred.append("negative")
        # elif "neutral" in answer:
        #     y_pred.append("neutral")
        if not selection:
            y_pred.append("none")
    return y_pred

In [20]:
y_pred = predict(X_test, model, tokenizer, sentiments)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.
100%|███████████████████████████████████████████| 80/80 [29:51<00:00, 22.40s/it]


In [23]:
evaluate(y_true, y_pred, sentiments)

Accuracy: 0.838
Accuracy for label 0: 0.950
Accuracy for label 1: 1.000
Accuracy for label 2: 1.000
Accuracy for label 3: 0.400

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.95      0.76        20
           1       0.95      1.00      0.98        20
           2       0.95      1.00      0.98        20
           3       1.00      0.40      0.57        20

    accuracy                           0.84        80
   macro avg       0.88      0.84      0.82        80
weighted avg       0.88      0.84      0.82        80


Confusion Matrix:
[[19  1  0  0]
 [ 0 20  0  0]
 [ 0  0 20  0]
 [11  0  1  8]]
