In [51]:
import pandas as pd
from glob import glob
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Dialog Act Mapping: 12 → 3 classes
dialog_act_map = {
    'CRQ': 'Speaker Initiative',
    'ORQ': 'Speaker Initiative',
    'IRQ': 'Speaker Initiative',
    'YNQ': 'Speaker Initiative',

    'GT': 'General',
    'GC': 'General',
    'ACK': 'General',
    
    'ID': 'Speaker Responsive',
    'CD': 'Speaker Responsive',
    'PA': 'Speaker Responsive',
    'OD': 'Speaker Responsive',
    'NA': 'Speaker Responsive'
}

# Get all csv files in current directory
csv_files = glob("*.csv")

# Store all matched (context, label) pairs
all_pairs = []

# Process each file
for file in csv_files:
    df = pd.read_csv(file)
    df.columns = ['ID', 'Type', 'Utterance', 'Dialog_Act']
    
    for i in range(len(df)):
        if str(df.loc[i, 'Type']).strip().lower() == 'p':
            # Search next therapist
            for j in range(i+1, len(df)):
                if str(df.loc[j, 'Type']).strip().lower() == 't':
                    context = df.loc[i, 'Utterance']
                    raw_label = df.loc[j, 'Dialog_Act']
                    clean_label = str(raw_label).strip().upper()
                    mapped_label = dialog_act_map.get(clean_label)
                    
                    if mapped_label:
                        all_pairs.append((context, mapped_label))
                    break

# Convert to DataFrame
final_df = pd.DataFrame(all_pairs, columns=['context', 'label'])

# Display stats
print(f"Total valid (patient → therapist) examples: {len(final_df)}")
print("\n Label Distribution:")
print(final_df['label'].value_counts())

print(final_df.head(5))
# Save to CSV
final_df.to_csv("dialog_acts_3class_dataset.csv", index=False)



Total valid (patient → therapist) examples: 237

 Label Distribution:
Speaker Initiative    179
General                46
Speaker Responsive     12
Name: label, dtype: int64
                                             context               label
0                     I do well, thanks for asking.   Speaker Responsive
1  Yeah, I'm, I'm recently married. And so my hus...  Speaker Initiative
2  Yes. Um, I participated in a sleep study and w...  Speaker Initiative
3  Colby? Um, well, I'm just little worried that ...  Speaker Initiative
4  Usually I wake up and he's like, what are you ...  Speaker Initiative


In [52]:
# Load the dataset
df = pd.read_csv("dialog_acts_3class_dataset.csv")
print(f"Total original samples: {len(df)}")

# Data cleaning
df = df.drop_duplicates()  # Remove duplicates
df = df[df['context'].notnull()]  # Remove null values
df = df[df['context'].str.strip() != '']  # Remove empty strings
df = df[df['context'].str.len() > 5]  # Remove too-short text entries

# Keep only the three valid target classes
valid_labels = ['Speaker Initiative', 'General', 'Speaker Responsive']
df = df[df['label'].isin(valid_labels)]

print(f"Cleaned sample count: {len(df)}")

print("\nClass distribution:")
print(df['label'].value_counts())


Total original samples: 237
Cleaned sample count: 196

Class distribution:
Speaker Initiative    149
General                36
Speaker Responsive     11
Name: label, dtype: int64


In [53]:
# Handle class imbalance via upsampling
df_major = df[df.label == 'Speaker Initiative']
df_mid = df[df.label == 'General']
df_minor = df[df.label == 'Speaker Responsive']

df_mid_up = resample(df_mid, replace=True, n_samples=100, random_state=42)
df_minor_up = resample(df_minor, replace=True, n_samples=100, random_state=42)

df_balanced = pd.concat([df_major, df_mid_up, df_minor_up])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("Class distribution after upsampling:")
print(df_balanced['label'].value_counts())


Class distribution after upsampling:
Speaker Initiative    149
General               100
Speaker Responsive    100
Name: label, dtype: int64


In [54]:
# Split features and labels
X = df_balanced['context']
y = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Build model pipeline: TF-IDF + Logistic Regression
model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
    ('clf', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate model performance
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
                    precision    recall  f1-score   support

           General       0.95      0.90      0.92        20
Speaker Initiative       0.93      0.93      0.93        30
Speaker Responsive       0.95      1.00      0.98        20

          accuracy                           0.94        70
         macro avg       0.94      0.94      0.94        70
      weighted avg       0.94      0.94      0.94        70



In [55]:
# Save the model
joblib.dump(model, 'model.pkl')
print("Model saved as model.pkl")

Model saved as model.pkl
