In [None]:
import pandas as pd


#load dataset

# Hand the path to pandas directly: passing a literal JSON string to
# read_json is deprecated (pandas >= 2.1), and this avoids keeping the raw
# file text alive next to the parsed frame.
df = pd.read_json('train_smhd_limited.json', lines=True)

# Use explode to split the 'posts' column into separate rows
df_exploded = df.explode('posts', ignore_index=True)


def _posts_for_condition(exploded, condition):
    """Return the rows of `exploded` for one condition with posts unpacked.

    Parameters
    ----------
    exploded : pandas.DataFrame
        Frame with a 'condition' column and a 'posts' column holding one
        post mapping per row (each read via the keys 'text' and
        'created_utc').
    condition : str
        Value of the 'condition' column to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame (never a view of `exploded`) where 'posts' is replaced
        by the columns 'text' and 'created_at'. The original index labels
        are kept.
    """
    rows = exploded.loc[exploded['condition'] == condition]
    # explode() turns an empty 'posts' list into a NaN row; such rows have no
    # post to unpack and would make post['text'] raise a TypeError.
    rows = rows.dropna(subset=['posts'])
    # assign() builds a new frame, so nothing is written onto a slice of
    # df_exploded (the source of SettingWithCopyWarning in the old code).
    return rows.assign(
        text=[post['text'] for post in rows['posts']],
        created_at=[post['created_utc'] for post in rows['posts']],
    ).drop(columns='posts')


df_adhd = _posts_for_condition(df_exploded, 'adhd')
df_control = _posts_for_condition(df_exploded, 'control')
display(df_control)


In [None]:
# Stack both classes into one frame with a fresh 0..n-1 index.
df_combined = pd.concat([df_adhd, df_control], ignore_index=True)
display(df_combined)

# Encode the class as an integer: control -> 0, adhd -> 1.
# map() + an explicit cast is used instead of replace(): replace() on an
# object column relies on implicit downcasting (deprecated in pandas 2.2) to
# end up with integers. The cast also fails loudly if any condition other
# than 'control'/'adhd' slipped in, since those would map to NaN.
df_combined['label_int'] = (
    df_combined['condition'].map({'control': 0, 'adhd': 1}).astype('int64')
)
display(df_combined)

Tokenize


In [None]:
import nltk
# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

# Coerce every entry to str so word_tokenize always receives a string
# (non-string values become their str() representation).
df_combined['text'] = df_combined['text'].astype(str)

df_combined['tokenized_text'] = df_combined['text'].apply(nltk.word_tokenize)

# Write the DataFrame to a JSON file
df_combined.to_json('tokenized_data.json', orient='records', lines=True)

# Also persist it as a pickle: the following cell reloads
# 'tokenized_data.pkl', which was never written anywhere in this notebook,
# so a fresh "Restart & Run All" failed with FileNotFoundError.
df_combined.to_pickle('tokenized_data.pkl')

In [None]:
import pickle
# Reload the tokenized data from disk so the expensive tokenization step does
# not have to be repeated in every session.
# SECURITY: pickle.load can execute arbitrary code while deserializing - only
# load pickle files that you produced yourself.
with open('tokenized_data.pkl', 'rb') as file:
    tokenized_data = pickle.load(file)

# Rebind df_combined to a frame built from the unpickled object; this replaces
# the df_combined created by the earlier cells.
df_combined = pd.DataFrame(tokenized_data)
display(df_combined)

Word2Vec embedding

In [None]:
import numpy as np
from gensim.models import Word2Vec

# Fit Word2Vec on the token lists of every post (both classes, before the
# train/test split further down).
model = Word2Vec(
    sentences=df_combined['tokenized_text'],
    vector_size=1000,
    window=5,
    min_count=1,
    workers=4,
)

def get_embedding(tokens, keyed_vectors=None):
    """Average the word vectors of `tokens` into a single vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text. Tokens missing from the vocabulary are ignored.
    keyed_vectors : optional
        Vector store exposing ``key_to_index`` (mapping token -> index),
        ``__getitem__`` (token -> vector) and ``vector_size``. Defaults to
        the notebook-level ``model.wv``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all known tokens, or a zero
        vector of length ``vector_size`` when no token is known (including
        an empty token list).
    """
    wv = model.wv if keyed_vectors is None else keyed_vectors

    # Membership is tested against the key_to_index dict (hash lookup).
    # The previous `token in wv.index_to_key` scanned a list of the whole
    # vocabulary for every single token.
    known_vectors = [wv[token] for token in tokens if token in wv.key_to_index]

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(wv.vector_size)

# Attach one averaged Word2Vec vector to every row, computed from that row's
# token list.
post_tokens = df_combined['tokenized_text']
df_combined['text_embedding'] = post_tokens.apply(get_embedding)

display(df_combined)


Split data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Split each class 80/20 on its own so both splits keep the class ratio.
# NOTE(review): the rows being split are individual posts, so posts that came
# from the same source record can land in both train and test - confirm that
# this is intended.
is_label_0 = df_combined['label_int'] == 0
is_label_1 = df_combined['label_int'] == 1

# Shuffle each class with a fixed seed before splitting.
df_label_0 = df_combined[is_label_0].sample(frac=1, random_state=42)
df_label_1 = df_combined[is_label_1].sample(frac=1, random_state=42)

df_80_label_0, df_20_label_0 = train_test_split(
    df_label_0, test_size=0.2, random_state=42
)
df_80_label_1, df_20_label_1 = train_test_split(
    df_label_1, test_size=0.2, random_state=42
)

# Recombine the per-class parts, then shuffle so the classes are interleaved.
df_train = pd.concat(
    [df_80_label_0, df_80_label_1], ignore_index=True
).sample(frac=1, random_state=42)
df_test = pd.concat(
    [df_20_label_0, df_20_label_1], ignore_index=True
).sample(frac=1, random_state=42)

In [None]:
# Integer class labels for each split.
y_train, y_test = df_train['label_int'], df_test['label_int']

# Averaged Word2Vec vectors for each split.
x_train_embed, x_test_embed = df_train['text_embedding'], df_test['text_embedding']

# Raw post text for each split.
x_train_text, x_test_text = df_train['text'], df_test['text']

Save the train/test splits to pickle files

In [None]:
import pickle
# Map each output file to the object stored in it; the dict keeps insertion
# order, so the files are written in the order listed here.
pickle_targets = {
    'x_train_smhd_embed.pkl': x_train_embed,
    'y_train_smhd.pkl': y_train,
    'y_test_smhd.pkl': y_test,
    'x_test_smhd_embed.pkl': x_test_embed,
    'x_test_smhd_text.pkl': x_test_text,
    'x_train_smhd_text.pkl': x_train_text,
}

for target_path, payload in pickle_targets.items():
    with open(target_path, 'wb') as file:
        pickle.dump(payload, file)