## CS6474 Assignment III

### Tyra Silaphet | 4-9-2025

#### Part 1a: Model 1 - n-grams

In [5]:
# Part 1a: Analysis of Altruistic Request Success on Social Media Using SVM with N-gram Features

import json
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# Set random seed for reproducibility
random.seed(42)

def data_preprocessing_a(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)  # Load entire JSON array
    
    # Extract relevant fields (texts and labels)
    texts = [d['request_text'] for d in data]
    labels = [1 if d['requester_received_pizza'] else 0 for d in data]
    
    # Shuffle data to avoid any ordering effects
    texts, labels = shuffle(texts, labels, random_state=123)
    return texts, labels

def evaluation(model, X_test, y_test):
    # Generate predictions and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    
    # Calculate required metrics using libraries 
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba)
    }
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics["Specificity"] = tn / (tn + fp)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return metrics

def main():
    # Call preprocessing function
    print("Loading and preprocessing data...")
    filepath = r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\data\pizza_request_dataset.json'
    texts, labels = data_preprocessing_a(filepath)
    
    # Split data into training (90%) and test (10%) sets
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, 
        test_size=0.1, 
        random_state=42,
        stratify=labels  # Address dataset imbalance
    )
    
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Class distribution (train): {sum(y_train)/len(y_train):.2%} positive")
    
    # Text processing pipeline with SVM
    model = Pipeline([
        ('vectorizer', TfidfVectorizer(
            ngram_range=(1,2), # 1=Unigrams, 2=bigrams
            max_features=1000, # Select the top 1000 features (500 uni + 500 bi)
            sublinear_tf=True, # Dampens effects of very frequent terms
            stop_words='english',  # Remove common words
            min_df=5,  # Ignore terms appearing in fewer than 5 documents
            max_df=0.7  # Ignore terms appearing in >70% of documents
        )),
        
        # SVM classifier with:
        # - linear kernel for interpretability
        # - class_weight='balanced' to handle imbalanced data
        # - probability=True to enable ROC AUC calculation
        ('classifier', SVC(
            kernel='linear',
            class_weight='balanced',
            probability=True,
            random_state=42
        ))
    ])
    
    # 4. Train model
    print("\nTraining model...")
    model.fit(X_train, y_train)
    
    # 5. Evaluate model
    print("\nEvaluating model performance...")
    metrics = evaluation(model, X_test, y_test)
    
    # Print results in a clean table format
    print("\nPerformance Metrics:")
    print("{:<12} {:.3f}".format('Accuracy', metrics['Accuracy']))
    print("{:<12} {:.3f}".format('Precision', metrics['Precision']))
    print("{:<12} {:.3f}".format('Recall', metrics['Recall']))
    print("{:<12} {:.3f}".format('F1 Score', metrics['F1 Score']))
    print("{:<12} {:.3f}".format('Specificity', metrics['Specificity']))
    print("{:<12} {:.3f}".format('AUC', metrics['AUC']))

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Training samples: 5103
Test samples: 568
Class distribution (train): 24.63% positive

Training model...

Evaluating model performance...

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.67      0.74       428
           1       0.35      0.54      0.42       140

    accuracy                           0.64       568
   macro avg       0.58      0.60      0.58       568
weighted avg       0.70      0.64      0.66       568


Performance Metrics:
Accuracy     0.637
Precision    0.347
Recall       0.536
F1 Score     0.421
Specificity  0.671
AUC          0.620


#### Part 1b: Model 2 - Activity and Reputation

In [7]:
# Part 1b: Analysis Using Activity and Reputation Features

import json
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# Set random seed for reproducibility
np.random.seed(42)

def feature_extraction(data):
    features = []
    
    for post in data:
        # Activity features
        activity = [
            int(post['post_was_edited']),
            float(post['requester_account_age_in_days_at_request']),
            float(post['requester_account_age_in_days_at_retrieval']),
            float(post['requester_days_since_first_post_on_raop_at_request']),
            float(post['requester_days_since_first_post_on_raop_at_retrieval']),
            int(post['requester_number_of_comments_at_request']),
            int(post['requester_number_of_comments_at_retrieval']),
            int(post['requester_number_of_comments_in_raop_at_request']),
            int(post['requester_number_of_comments_in_raop_at_retrieval']),
            int(post['requester_number_of_posts_at_request']),
            int(post['requester_number_of_posts_at_retrieval']),
            int(post['requester_number_of_posts_on_raop_at_request']),
            int(post['requester_number_of_posts_on_raop_at_retrieval']),
            int(post['requester_number_of_subreddits_at_request']),
            len(post['requester_subreddits_at_request']) if post['requester_subreddits_at_request'] != 'N/A' else 0
        ]
        
        # Reputation features
        reputation = [
            int(post['number_of_downvotes_of_request_at_retrieval']),
            int(post['number_of_upvotes_of_request_at_retrieval']),
            int(post['requester_upvotes_minus_downvotes_at_request']),
            int(post['requester_upvotes_minus_downvotes_at_retrieval']),
            int(post['requester_upvotes_plus_downvotes_at_request']),
            int(post['requester_upvotes_plus_downvotes_at_retrieval'])
        ]

        # Combine all features 
        features.append(activity + reputation)
    
    return np.array(features)

def data_preprocessing(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    X = feature_extraction(data)
    y = np.array([1 if post['requester_received_pizza'] else 0 for post in data])
    
    return X, y

def evaluation(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba)
    }
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics["Specificity"] = tn / (tn + fp)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return metrics

def main():
    # Call preprocessing function
    print("Loading and preprocessing data...")
    filepath = r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\data\pizza_request_dataset.json'
    X, y = data_preprocessing(filepath)
    
    # Split data (90% train, 10% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42, stratify=y)
    
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Class distribution (train): {sum(y_train)/len(y_train):.2%} positive")
    
    # Create pipeline with standardization and SVM
    model = Pipeline([
        ('scaler', StandardScaler()),  
        ('classifier', SVC(
            kernel='linear',
            class_weight='balanced',
            probability=True,
            random_state=42
        ))
    ])
    
    # Train model
    print("\nTraining model...")
    model.fit(X_train, y_train)
    
    # Evaluate model
    print("\nEvaluating model performance...")
    metrics = evaluation(model, X_test, y_test)
    
    # Print results
    print("\nPerformance Metrics:")
    print("{:<12} {:.3f}".format('Accuracy', metrics['Accuracy']))
    print("{:<12} {:.3f}".format('Precision', metrics['Precision']))
    print("{:<12} {:.3f}".format('Recall', metrics['Recall']))
    print("{:<12} {:.3f}".format('F1 Score', metrics['F1 Score']))
    print("{:<12} {:.3f}".format('Specificity', metrics['Specificity']))
    print("{:<12} {:.3f}".format('AUC', metrics['AUC']))

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Training samples: 5103
Test samples: 568
Class distribution (train): 24.63% positive

Training model...

Evaluating model performance...

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       428
           1       0.60      0.56      0.58       140

    accuracy                           0.80       568
   macro avg       0.73      0.72      0.73       568
weighted avg       0.80      0.80      0.80       568


Performance Metrics:
Accuracy     0.801
Precision    0.603
Recall       0.564
F1 Score     0.583
Specificity  0.879
AUC          0.839


#### Part 1c: Model 3 - Narratives

In [8]:
# Part 1c: Analysis Using Narrative Dimensions Features

import json
import re
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, confusion_matrix,
                           classification_report)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

np.random.seed(42)

def load_narrative_words(narrative_paths):
    narratives = {}
    for name, path in narrative_paths.items():
        with open(path, 'r') as f:
            words = [line.strip() for line in f if line.strip()]
            narratives[name] = words
    return narratives

def narrative_feature_extraction(texts, narrative_words):
    features = []
    for text in texts:
        words = re.findall(r'\w+', text.lower())
        total_words = max(len(words), 1)  # Avoid division by zero
        
        narrative_counts = []
        for name, word_list in narrative_words.items():
            matches = sum(1 for word in words if word in word_list)
            narrative_counts.append(matches / total_words)
        
        features.append(narrative_counts)
    return np.array(features)

def data_preprocessing(path, narrative_paths):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    texts = [d['request_text'] for d in data]
    labels = [1 if d['requester_received_pizza'] else 0 for d in data]
    
    # Load narrative words
    narrative_words = load_narrative_words(narrative_paths)
    
    # Extract features
    X = narrative_feature_extraction(texts, narrative_words)
    y = np.array(labels)
    
    return X, y

def main():
    # Define paths to narrative files
    narrative_paths = {
        'desire': r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\narratives\desire.txt',
        'family': r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\narratives\family.txt',
        'job': r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\narratives\job.txt',
        'money': r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\narratives\money.txt',
        'student': r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\narratives\student.txt'
    }
    
    # Call data preprocessing function
    print("Loading and preprocessing data...")
    json_path = r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\data\pizza_request_dataset.json'
    X, y = data_preprocessing(json_path, narrative_paths)
    
    # Split data (90% train, 10% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42, stratify=y)
    
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Class distribution (train): {sum(y_train)/len(y_train):.2%} positive")
    
    # Create pipeline with standardization and SVM
    model = Pipeline([
        ('scaler', StandardScaler()), 
        ('svm', SVC(
            kernel='linear',
            class_weight='balanced',
            probability=True,
            random_state=42
        ))
    ])
    
    # Train model
    print("\nTraining model...")
    model.fit(X_train, y_train)
    
    # Evaluate model
    print("\nEvaluating model performance...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    
    # Calculate metrics
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba)
    }
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics["Specificity"] = tn / (tn + fp)
    
    # Print results
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nPerformance Metrics:")
    print("{:<12} {:.3f}".format('Accuracy', metrics['Accuracy']))
    print("{:<12} {:.3f}".format('Precision', metrics['Precision']))
    print("{:<12} {:.3f}".format('Recall', metrics['Recall']))
    print("{:<12} {:.3f}".format('F1 Score', metrics['F1 Score']))
    print("{:<12} {:.3f}".format('Specificity', metrics['Specificity']))
    print("{:<12} {:.3f}".format('AUC', metrics['AUC']))

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Training samples: 5103
Test samples: 568
Class distribution (train): 24.63% positive

Training model...

Evaluating model performance...

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.70      0.72       428
           1       0.22      0.26      0.24       140

    accuracy                           0.59       568
   macro avg       0.48      0.48      0.48       568
weighted avg       0.61      0.59      0.60       568


Performance Metrics:
Accuracy     0.590
Precision    0.218
Recall       0.257
F1 Score     0.236
Specificity  0.699
AUC          0.495


#### Part 1d: Model 4 - Moral Foundations

In [None]:
# Part 1d: Analysis Using Moral Foundations Features

import json
import re
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, confusion_matrix,
                           classification_report)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

np.random.seed(42)

def load_moral_dictionary(dict_path):
    foundation_map = {
        '01': 'care',
        '02': 'harm',
        '03': 'fairness',
        '04': 'unfairness',
        '05': 'loyalty',
        '06': 'betrayal',
        '07': 'authority',
        '08': 'subversion',
        '09': 'purity',  # This was previously called 'sanctity' in your code
        '10': 'degradation',
        '11': 'morality_general'
    }
    
    foundations = {v: [] for v in foundation_map.values()}
    
    with open(dict_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('%'):
                continue
                
            parts = line.split()
            if len(parts) < 2:
                continue
                
            word = parts[0].replace('*', '')
            codes = parts[1:]
            
            for code in codes:
                if code in foundation_map:
                    foundation = foundation_map[code]
                    if word not in foundations[foundation]:  # Avoid duplicates
                        foundations[foundation].append(word)
    
    # Verify we loaded words for all required foundations
    required_foundations = ['care', 'fairness', 'loyalty', 'authority', 'purity']
    for foundation in required_foundations:
        if not foundations.get(foundation):
            print(f"Warning: No words loaded for foundation {foundation}")
    
    return {k: v for k, v in foundations.items() if k in required_foundations}

def moral_feature_extraction(texts, foundations):
    features = []
    
    for text in texts:
        words = re.findall(r'\w+', text.lower())
        total_words = max(len(words), 1)  # Avoid division by zero
        
        foundation_counts = []
        # Use consistent foundation order
        for foundation in ['care', 'fairness', 'loyalty', 'authority', 'purity']:
            matches = sum(1 for word in words if word in foundations[foundation])
            foundation_counts.append(matches / total_words)
        
        features.append(foundation_counts)
    
    return np.array(features)

def data_preprocessing(json_path, dict_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    texts = [d['request_text'] for d in data]
    labels = [1 if d['requester_received_pizza'] else 0 for d in data]
    
    # Load moral foundations dictionary
    foundations = load_moral_dictionary(dict_path)
    
    # Extract features
    X = moral_feature_extraction(texts, foundations)
    y = np.array(labels)
    
    return X, y

def main():
    # Define paths
    json_path = r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\data\pizza_request_dataset.json'
    dict_path = r'C:\Users\silap\Desktop\spring24\cs6474\Assignment3\resources\MoralFoundations.dic'
    
    # Call data preprocessing function
    print("Loading and preprocessing data...")
    X, y = data_preprocessing(json_path, dict_path)
    
    # Split data (90% train, 10% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42, stratify=y)
    
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Class distribution (train): {sum(y_train)/len(y_train):.2%} positive")
    
    # Create pipeline with standardization and SVM
    model = Pipeline([
        ('scaler', StandardScaler()),  
        ('svm', SVC(
            kernel='linear',
            class_weight='balanced',
            probability=True,
            random_state=42
        ))
    ])
    
    # Train model
    print("\nTraining model...")
    model.fit(X_train, y_train)
    
    # Evaluate model
    print("\nEvaluating model performance...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    
    # Calculate metrics
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba)
    }
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics["Specificity"] = tn / (tn + fp)
    
    # Print results
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nPerformance Metrics:")
    print("{:<12} {:.3f}".format('Accuracy', metrics['Accuracy']))
    print("{:<12} {:.3f}".format('Precision', metrics['Precision']))
    print("{:<12} {:.3f}".format('Recall', metrics['Recall']))
    print("{:<12} {:.3f}".format('F1 Score', metrics['F1 Score']))
    print("{:<12} {:.3f}".format('Specificity', metrics['Specificity']))
    print("{:<12} {:.3f}".format('AUC', metrics['AUC']))

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Training samples: 5103
Test samples: 568
Class distribution (train): 24.63% positive

Training model...

Evaluating model performance...

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84       428
           1       0.24      0.04      0.07       140

    accuracy                           0.73       568
   macro avg       0.50      0.50      0.46       568
weighted avg       0.63      0.73      0.65       568


Performance Metrics:
Accuracy     0.731
Precision    0.240
Recall       0.043
F1 Score     0.073
Specificity  0.956
AUC          0.464
