Reading the excel file and printing

In [18]:
import pandas as pd

df = pd.read_excel("ADHD2012.xlsx")

Preprocessing

In [19]:
df = df.drop_duplicates()
df_filtered = df[~(df['label'] == 'x')]
df_filtered = df[~(df['label'] == 'X')]

df_anonymized = df[['title','selftext','label']]
df_anonymized = df_anonymized.dropna(subset=['title', 'selftext'])
df_anonymized = df_anonymized[(df_anonymized['title'] != '') & (df_anonymized['selftext'] != '')]



In [20]:
import pandas as pd
import re

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    # Remove usernames
    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    return text

df_anonymized['title'] = df_anonymized['title'].apply(clean_text)
df_anonymized['selftext'] = df_anonymized['selftext'].apply(clean_text)

Tokenization

In [21]:
import nltk

df_anonymized['title'] = df_anonymized['title'].astype(str)
df_anonymized['selftext'] = df_anonymized['selftext'].astype(str)
df_anonymized['label'] = df_anonymized['label'].str.lower()


df_combined= pd.DataFrame({
    'id': range(1, len(df_anonymized) + 1),
    'combined_text': df_anonymized['title'] + df_anonymized['selftext'],
    'label': df_anonymized['label']
})

# Tokenize the 'Text' column
df_combined['tokenized_text'] = df_combined['combined_text'].apply(nltk.word_tokenize)

[nltk_data] Error loading /Users/varasheim/Desktop/masterny/masterdata
[nltk_data]     /activelearning/modAL/embeddings/tokenizers/punkt:
[nltk_data]     Package '/Users/varasheim/Desktop/masterny/masterdata/
[nltk_data]     activelearning/modAL/embeddings/tokenizers/punkt' not
[nltk_data]     found in index
[nltk_data] Error loading /Users/hildemikaelsen/Desktop/masterdata/mas
[nltk_data]     terdata/activelearning/modAL/embeddings/tokenizers/pun
[nltk_data]     kt: Package '/Users/hildemikaelsen/Desktop/masterdata/
[nltk_data]     masterdata/activelearning/modAL/embeddings/tokenizers/
[nltk_data]     punkt' not found in index


Map labels to integers

In [22]:
label_mapping = {'none': 0, 'self-diagnosis': 1, 'self-medication': 2}
df_combined['label'] = df_combined['label'].map(label_mapping)

Word2Vec embedding

In [23]:
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec(sentences=df_combined['tokenized_text'], vector_size=1000, window=5, min_count=1, workers=4) #train on all texts 

def get_embedding(tokens):
    valid_tokens = [token for token in tokens if token in model.wv.index_to_key]
    
    if valid_tokens:
        return np.mean([model.wv[token] for token in valid_tokens], axis=0)
    else:
        return np.zeros(model.vector_size)

# Apply the function to create title embeddings for all titles in the 'tokenized_title' column
df_combined['text_embedding'] = df_combined['tokenized_text'].apply(get_embedding)
print(df_combined[['id', 'combined_text', 'text_embedding']])


          id                                      combined_text  \
0          1  Android app to strengthen attention/focusHey /...   
1          2  Does anyone here have experience with Imiprami...   
3          3  What does the ADHD test look like?I'm 21 and d...   
4          4  Are you guys good with maps and directions?It ...   
5          5  Just started Concerta today any advice/experie...   
...      ...                                                ...   
17841  17841  Where are my programmers/coders/developers wit...   
17842  17842  Chemically, what is ADHD?I heard its just a la...   
17843  17843  Modeling sensitization to stimulants in humans...   
17844  17844  How to know when my meds have worn off. AKA: I...   
17845  17845  Anyone here taken the Fundamentals of Engineer...   

                                          text_embedding  
0      [0.055784408, 0.0014394955, 0.0440787, 0.12509...  
1      [-0.1229137, 0.22171226, -0.054751925, 0.02254...  
3      [-0.1288491

Split into labeled and unlabeled dataframes

In [24]:
embeddings_labeled = df_combined[~df_combined['label'].isna()]
embeddings_unlabeled = df_combined[df_combined['label'].isna()]

labeled_array = np.array(embeddings_labeled[['label', 'text_embedding']].to_numpy())
unlabeled_array = np.array(embeddings_unlabeled[['label', 'text_embedding']].to_numpy())

X_pool = np.array([item[1] for item in unlabeled_array])
Y_pool = np.array([item[0] for item in unlabeled_array])


remove_these = np.random.choice(labeled_array.shape[0], 200, replace=False)
evaluation_data = labeled_array[remove_these]
labeled_new = np.delete(labeled_array, remove_these, axis=0)

X_training = np.array([item[1] for item in labeled_new])
y_training = np.array([item[0] for item in labeled_new])

evaluation_x= np.array([item[1] for item in evaluation_data])
evaluation_y= np.array([item[0] for item in evaluation_data])


In [25]:
#make arrays for transformer models
labeled_text_array = np.array(embeddings_labeled[['label', 'combined_text']].to_numpy())
unlabeled_text_array = np.array(embeddings_unlabeled[['label', 'combined_text']].to_numpy())

X_pool_text = np.array([item[1] for item in unlabeled_text_array])
Y_pool_text = np.array([item[0] for item in unlabeled_text_array])


remove_these_text = np.random.choice(labeled_text_array.shape[0], 200, replace=False)
evaluation_data_text = labeled_text_array[remove_these_text]
labeled_new_text = np.delete(labeled_text_array, remove_these_text, axis=0)

X_training_text = np.array([item[1] for item in labeled_new_text])
y_training_text = np.array([item[0] for item in labeled_new_text])

evaluation_x_text= np.array([item[1] for item in evaluation_data_text])
evaluation_y_text= np.array([item[0] for item in evaluation_data_text])



Save preprocessed arrays with combined text to use for transformer models in pickle files

In [26]:
import pickle
with open('x_pool_text.pkl', 'wb') as file:
    pickle.dump(X_pool_text, file)

with open('y_pool_text.pkl', 'wb') as file:
    pickle.dump(Y_pool_text, file)

with open('unlabeled_text_array.pkl', 'wb') as file:
    pickle.dump(unlabeled_text_array, file)

with open('labeled_text_array.pkl', 'wb') as file:
    pickle.dump(labeled_text_array, file)

with open('X_training_text.pkl', 'wb') as file:
    pickle.dump(X_training_text, file)

with open('y_training_text.pkl', 'wb') as file:
    pickle.dump(y_training_text, file)

with open('evaluation_x_text.pkl', 'wb') as file:
    pickle.dump(evaluation_x_text, file)

with open('evaluation_y_text.pkl', 'wb') as file:
    pickle.dump(evaluation_y_text, file)


Save preprocessed arrays with Word2Vec embeddings to use for traditional and deep learning models in pickle files

In [27]:
import pickle
with open('x_pool_w2v.pkl', 'wb') as file:
    pickle.dump(X_pool, file)

with open('y_pool_w2v.pkl', 'wb') as file:
    pickle.dump(Y_pool, file)

with open('embeddings_unlabeled_w2v.pkl', 'wb') as file:
    pickle.dump(embeddings_unlabeled, file)

with open('X_training_unbalanced_w2v.pkl', 'wb') as file:
    pickle.dump(X_training, file)

with open('y_training_unbalanced_w2v.pkl', 'wb') as file:
    pickle.dump(y_training, file)

with open('evaluation_x_unbalanced_w2v.pkl', 'wb') as file:
    pickle.dump(evaluation_x, file)

with open('evaluation_y_unbalanced_w2v.pkl', 'wb') as file:
    pickle.dump(evaluation_y, file)


Test predict

In [28]:
X_test = np.array(embeddings_unlabeled['text_embedding'].tolist())
predict_test = X_test[:300]