In [None]:
import os
import re
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import dump
import tweepy
from joblib import load
import shap
import numpy as np

In [None]:
# Load the fitted TF-IDF vectorizer and classifier from disk.
# NOTE(review): both files are written by progressive_training() further down in
# this notebook, so on a fresh checkout that cell has to run before this one.
# joblib artifacts are pickle-based — only load files you produced yourself.
vectorizer = load("tfidf_vectorizer.joblib")
model = load("logistic_model.joblib")


def model_predict(texts):
    """Return class probabilities for an iterable of raw text strings.

    Args:
        texts: Iterable of raw strings; they are passed unchanged to the
            module-level ``vectorizer``.

    Returns:
        Whatever ``model.predict_proba`` returns for the vectorized texts
        (one row of class probabilities per input text).
    """
    X = vectorizer.transform(texts)
    return model.predict_proba(X)


# model_predict consumes raw strings, so SHAP needs a *text* masker that splits
# each string into tokens it can hide. The vectorizer's transform method is not
# a masker; use the same regex-based Text masker as the tweet explainer below.
explainer = shap.Explainer(model_predict, shap.maskers.Text(r"\W+"))

In [None]:

def explain_prediction_with_shap(text, user_id=None, show_plot=True):
    """
    Predict depression for one text and optionally render its SHAP text plot.

    Args:
        text: Raw text to classify.
        user_id: Optional identifier shown in the printed report; falsy values
            are displayed as 'Unknown'.
        show_plot: When True, compute SHAP values for ``text`` and render them
            with ``shap.plots.text``. When False, no SHAP work is done.

    Returns:
        None. The prediction is reported through ``print``.
    """
    # Column 1 of the probability row is treated as the "depressed" class.
    # NOTE(review): assumes the classifier's classes are ordered [0, 1] — confirm.
    pred_proba = model_predict([text])[0][1]
    pred_label = int(pred_proba >= 0.5)

    print("="*50)
    print(f"User: {user_id or 'Unknown'}")
    print(f"Prediction: {'Depressed' if pred_label else 'Not Depressed'}")
    # The value printed is always the class-1 probability, also when the
    # predicted label is 'Not Depressed'.
    print(f"Confidence: {pred_proba:.2f}")
    print("="*50)

    # The SHAP values feed only the plot, so skip the explainer call entirely
    # when no plot was requested.
    if show_plot:
        shap_values = explainer([text])
        shap.plots.text(shap_values[0])

In [None]:
def extract_texts_from_chunk(file_path):
    """Return the body of every <TEXT>...</TEXT> element found in one chunk file.

    The file is read as UTF-8 with undecodable bytes dropped. Each body has its
    surrounding whitespace stripped and its newlines replaced by single spaces.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        raw = handle.read()
    # DOTALL lets a single post span several lines.
    return [
        match.group(1).strip().replace('\n', ' ')
        for match in re.finditer(r'<TEXT>(.*?)</TEXT>', raw, re.DOTALL)
    ]

In [None]:
def load_erisk_chunked_data(user_data_path, truth_file_path, max_chunks=10, max_posts_per_chunk=None):
    """Load per-user chunk texts and their labels into a DataFrame.

    Args:
        user_data_path: Directory containing chunk files named
            ``test_subject<ID>_<chunk>.<ext>``.
        truth_file_path: Text file with one ``<subject_id> <label>`` pair per
            line; lines that do not split into exactly two fields are skipped.
        max_chunks: Keep at most this many chunks per user (lowest chunk
            numbers first).
        max_posts_per_chunk: If truthy, keep only the first N posts of each
            chunk file.

    Returns:
        DataFrame with columns ``user_id``, ``chunks`` (list of strings, one
        per chunk, ordered by chunk number) and ``label``. Users that do not
        appear in the truth file are dropped. The columns are present even
        when no rows were loaded.
    """
    labels = {}
    with open(truth_file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                user_id_base, label = parts
                labels[user_id_base] = int(label)

    user_chunks = defaultdict(list)

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded shuffle/split downstream) reproducible.
    for filename in sorted(os.listdir(user_data_path)):
        if not filename.startswith("test_subject"):
            continue
        parts = filename.split("_")
        if len(parts) != 3:
            continue

        try:
            chunk_idx = int(parts[2].split(".")[0])  # chunk number (1 to 10)
        except ValueError:
            # Not a chunk file (third segment is not a number) — ignore it
            # instead of aborting the whole load.
            continue

        user_id_base = "_".join(parts[:2])  # e.g., test_subject25

        full_path = os.path.join(user_data_path, filename)
        posts = extract_texts_from_chunk(full_path)

        if max_posts_per_chunk:
            posts = posts[:max_posts_per_chunk]

        user_chunks[user_id_base].append((chunk_idx, " ".join(posts)))

    data = []
    for user_id_base, chunks in user_chunks.items():
        label = labels.get(user_id_base)
        if label is None:
            continue
        # Order by chunk number, then cap at max_chunks.
        ordered = sorted(chunks, key=lambda item: item[0])[:max_chunks]
        data.append({
            "user_id": user_id_base,
            "chunks": [text for _, text in ordered],
            "label": label
        })

    return pd.DataFrame(data, columns=["user_id", "chunks", "label"])

In [None]:
def progressive_training(train_df, test_df, chunk_stages=(2, 4, 6, 8, 10)):
    """Train and evaluate a TF-IDF + logistic-regression model at each chunk stage.

    For every ``n`` in ``chunk_stages`` the first ``n`` chunks of each user are
    joined into one document; a fresh vectorizer and classifier are fitted on
    ``train_df`` and scored on ``test_df``.

    Args:
        train_df: DataFrame with a ``chunks`` column (list of strings per row)
            and a ``label`` column.
        test_df: DataFrame with the same two columns; used only for scoring.
        chunk_stages: Iterable of chunk counts to evaluate, processed in the
            order given.

    Returns:
        dict mapping each chunk count to a dict with the keys ``accuracy``,
        ``precision``, ``recall`` and ``f1``.

    Side effects:
        Overwrites ``tfidf_vectorizer.joblib`` and ``logistic_model.joblib`` in
        the working directory at every stage, so the files left on disk belong
        to the last stage processed.
    """
    results = {}

    for n_chunks in chunk_stages:
        print(f"==== Training with first {n_chunks} chunks ====")

        # Prepare data: one document per user built from the first n_chunks chunks.
        X_train = train_df['chunks'].apply(lambda chunks: " ".join(chunks[:n_chunks]))
        y_train = train_df['label']

        X_test = test_df['chunks'].apply(lambda chunks: " ".join(chunks[:n_chunks]))
        y_test = test_df['label']

        # TF-IDF: the vocabulary is fitted on the training documents only.
        vectorizer = TfidfVectorizer(max_features=5000)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        dump(vectorizer, "tfidf_vectorizer.joblib")

        # Train
        model = LogisticRegression(max_iter=1000, class_weight='balanced')
        model.fit(X_train_tfidf, y_train)
        dump(model, "logistic_model.joblib")

        # Predict
        y_pred = model.predict(X_test_tfidf)

        # Metrics. zero_division=0 is passed to every ratio metric (not just
        # precision) so a stage with no positive labels or predictions yields
        # 0 without an undefined-metric warning.
        results[n_chunks] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0)
        }

    return results

In [None]:
# 📍 Set paths
# The dataset root can be overridden with the ERISK_DATA_ROOT environment
# variable; the fallback is the original hard-coded local directory.
# NOTE(review): the variables are named "train_*" but the default location is
# the eRisk 2017 *test* folder and its golden-truth file — confirm intended.
erisk_root = os.environ.get(
    "ERISK_DATA_ROOT",
    r'C:\Users\SYYAD\Documents\MSAI\AI in Healthcare\HRP\eRisk2017\2017\test',
)
train_data_path = os.path.join(erisk_root, 'user_data')
train_truth_path = os.path.join(erisk_root, 'test_golden_truth.txt')

# Load data: one row per labelled user, with that user's chunk texts.
train_df = load_erisk_chunked_data(train_data_path, train_truth_path)

In [None]:
# Split into Train and Test
def split_train_test(df, test_size=0.2, random_state=42):
    """Shuffle ``df`` with a fixed seed and cut it into (train, test) frames.

    The first ``int(len(df) * (1 - test_size))`` shuffled rows form the train
    frame; the remaining rows form the test frame.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    cut = int(len(shuffled) * (1 - test_size))
    return shuffled.iloc[:cut], shuffled.iloc[cut:]

# Shuffle the users once and hold part of them out for evaluation
# (split_train_test defaults: test_size=0.2, random_state=42).
train_users, test_users = split_train_test(train_df)

# Fit and score one model per chunk stage.
results = progressive_training(train_users, test_users)

# Tabulate the per-stage metrics in ascending chunk order.
print("Chunks | Accuracy | Precision | Recall | F1")
print("----------------------------------------------")
for n_chunks in sorted(results):
    metrics = results[n_chunks]
    row = f"{n_chunks:>6} | {metrics['accuracy']:.3f}   | {metrics['precision']:.3f}    | {metrics['recall']:.3f}  | {metrics['f1']:.3f}"
    print(row)

In [None]:
# Unpack `results` into one x-axis list and one list per metric, keeping the
# order in which the stages were recorded.
chunk_numbers, accuracies, precisions, recalls, f1s = [], [], [], [], []

for n_chunks, metrics in results.items():
    chunk_numbers += [n_chunks]
    accuracies += [metrics['accuracy']]
    precisions += [metrics['precision']]
    recalls += [metrics['recall']]
    f1s += [metrics['f1']]

# One line per metric, all drawn against the number of chunks seen.
plt.figure(figsize=(10, 6))
for series, legend_label in (
    (accuracies, 'Accuracy'),
    (precisions, 'Precision'),
    (recalls, 'Recall'),
    (f1s, 'F1 Score'),
):
    plt.plot(chunk_numbers, series, marker='o', label=legend_label)

plt.title('Progressive Depression Detection Performance')
plt.xlabel('Number of Chunks Seen')
plt.ylabel('Score')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Set up the Twitter API client.
# The bearer token is read from the TWITTER_BEARER_TOKEN environment variable so
# the real credential never has to be saved in the notebook; the literal below
# is only a placeholder fallback and will not authenticate.
bearer_token = os.environ.get(
    "TWITTER_BEARER_TOKEN",
    "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
)

client = tweepy.Client(bearer_token=bearer_token)

def get_last_10_tweets(username, max_results=5):
    """Return the text of a user's most recent original tweets.

    Despite the function name, the number of tweets requested is
    ``max_results`` (default 5, the value this function has always used).
    Replies and retweets are excluded by the API call.

    Args:
        username: Twitter handle without the leading '@'.
        max_results: Number of tweets to request.
            NOTE(review): the API is believed to accept 5-100 here — confirm
            against the tweepy documentation before passing other values.

    Returns:
        List of tweet texts; empty when the account has no matching tweets.

    Raises:
        ValueError: If the lookup for ``username`` returns no user data.
    """
    user = client.get_user(username=username)
    if user.data is None:
        raise ValueError(f"Twitter user not found or not accessible: {username!r}")

    tweets = client.get_users_tweets(
        user.data.id, max_results=max_results, exclude=["replies", "retweets"]
    )
    # The response's .data is None (not an empty list) when nothing matched.
    return [tweet.text for tweet in (tweets.data or [])]

In [None]:
# Restore the fitted TF-IDF vectorizer and classifier saved by progressive_training().
vectorizer, model = (
    load("tfidf_vectorizer.joblib"),
    load("logistic_model.joblib"),
)

# Tweet normalisation
def clean_text(text):
    """Strip URLs, @mentions and '#' characters from ``text`` and lowercase it.

    Only the '#' symbol is removed; the hashtag word itself is kept.
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', text)
    without_handles = re.sub(r'\@\w+|\#', '', without_urls)
    return without_handles.lower()

# Probability function handed to SHAP: raw strings in, class probabilities out.
def predict_proba(texts):
    """Clean each raw text, vectorize the batch and return the class probabilities."""
    features = vectorizer.transform(list(map(clean_text, texts)))
    return model.predict_proba(features)

# Explainer over raw text; the masker tokenizes on runs of non-word characters.
text_explainer = shap.Explainer(predict_proba, masker=shap.maskers.Text(r"\W+"))

# Full prediction and explanation function
def predict_depression_with_explanation(tweets, show_plot=True):
    """Classify a user's tweets as one document and optionally explain the result.

    Args:
        tweets: Iterable of raw tweet strings. A single string is treated as
            one tweet rather than being joined character by character.
        show_plot: When True, compute SHAP values for the combined text and
            render them with ``shap.plots.text``.

    Returns:
        ``"Depressed"`` or ``"Not Depressed"``.
    """
    if isinstance(tweets, str):
        tweets = [tweets]

    # All tweets are cleaned and concatenated into a single document.
    combined = " ".join(clean_text(t) for t in tweets)

    # Vectorize once and reuse the features for both the label and the probability.
    features = vectorizer.transform([combined])
    pred = model.predict(features)[0]
    label = "Depressed" if pred else "Not Depressed"
    # Column 1 is reported as the confidence regardless of the predicted label.
    prob = model.predict_proba(features)[0][1]

    print(f"Prediction: {label}")
    print(f"Confidence: {prob:.2f}")

    if show_plot:
        shap_values = text_explainer([combined])
        shap.plots.text(shap_values[0])

    return label


In [1]:
# Keep the handle in one variable so the printed message always names the
# account that was actually queried.
username = "thisusertwtss"
tweets = get_last_10_tweets(username)
print(f"Last 5 tweets from {username}: {tweets}")

NameError: name 'get_last_10_tweets' is not defined

In [None]:
# Classify the tweets fetched in the previous cell; show_plot defaults to True,
# so the SHAP text plot is rendered as well.
status = predict_depression_with_explanation(tweets)
print(f"Predicted status: {status}")

In [2]:
# Fetch recent tweets for the account. Requires the cell that defines
# get_last_10_tweets to have been run first in this kernel.
tweets = get_last_10_tweets("thisusertwtss")
print(f"Last 5 tweets from thisusertwtss: {tweets}")

NameError: name 'get_last_10_tweets' is not defined

In [None]:
# Classify the tweets fetched in the previous cell and print the returned label.
status = predict_depression_with_explanation(tweets)
print(f"Predicted status: {status}")