In [None]:
!pip install openai

In [None]:
import getpass
import os

# Prefer the OPENAI_API_KEY environment variable so "Restart & Run All" can run
# unattended; otherwise ask for the key without echoing it into the output.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

··········


In [None]:
import pandas as pd

# CSV export of the published Google Sheet that holds the samples
DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRc1EkKgndtFkwtypr9d6W6qzw2LIX4eRnNbJYYSIG7cjXRgsuTto4Q5HXaeFj-jAn36RYpFwDOVF_N/pub?gid=1495258861&single=true&output=csv'

# Load the sheet and add a 1-based `id` column (row index + 1)
df = pd.read_csv(DATA_URL).assign(id=lambda frame: frame.index + 1)
df.head()

Unnamed: 0,name,age,content,len,id
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5


In [None]:
# return a batch of rows labelled as samples
def prepare_content(batch):
    """Render a batch of rows as one prompt string.

    Each row becomes a "## Sample <id>" heading followed by the row's
    ``content`` wrapped between two "---" rules; the sections are
    concatenated in row order. An empty batch yields an empty string.
    """
    sections = []
    for _, record in batch.iterrows():
        heading = f"## Sample {record['id']}\n"
        body = f"---\n{record['content']}\n---\n\n"
        sections.append(heading + body)
    return "".join(sections)

# display 10 samples
print(prepare_content(df.iloc[:10]))

## Sample 1
---
I: Let's begin with ‚how I see myself'. Can you just go through the pictures that you've 
used and explain what pictures you'
ve used and why you've used them? 
M: Yeah. I've used this woman [Kelly Ro
wland ] b ecau se sh e's quite fashionable and I 
see myself as a fashionable person. I' ve used a picture of a Liverpool football team 

because I see myself as quite a supportive pers
on towards the team. I put a love heart 
because I'm quite a loving person. I also 
put a laptop because I like to explore things 
and try new things. 
I: What about the pictures of Kat, Alfie [
EastEnders
 characters] and, is that Lemar? 
M: Bow Wow. 
I: Let's start with Kat and Alfie, why have you used them? 

M: Because Kat, erm Alfie's quite cheery and he makes people smile so metimes so I just 
like Alfie, and I also have a temper to so I put Kat. 
I: And what about Bow Wow? 
M: I put him because I like him (laughs). 
I: So if you were going to sum up how you see yourself, what words w

In [None]:
# AsyncOpenAI is imported under the name OpenAI, so `client` is the asynchronous
# client and its request calls have to be awaited.
from openai import AsyncOpenAI as OpenAI
client = OpenAI(api_key=api_key)

# System prompt sent with every request: it asks for thematic labels across the
# "## Sample N" sections and shows the layout the reply should follow.
# NOTE(review): the example lines end with a trailing comma and include a
# "// ..." placeholder line, neither of which is valid JSONL, and the example is
# wrapped in a Markdown code fence - whatever parses the reply has to cope with
# all three.
system_prompt = """Analyze the given text samples (## Sample 1, ## Sample 2, etc) for thematic labels based on similarities and differences.
The response format is JSONL.

**Format:**
```jsonl
{"name": "label name", "samples": ["1", "2", ...]},
{"name": "another label name", "samples": ["2", "3", ...]},
{"name": "unique label name", "samples": ["1"]},
{"name": "descriptive label name", "samples": ["1", "2", ...]},
{"name": "another important label", "samples": ["2", "3", ...]},
{"name": "a interesting unique label", "samples": ["1"]},
// ... more labels as identified
```"""


async def call_openai(batch):
    """Send one batch of samples to the chat model and return its reply text.

    The batch is rendered with ``prepare_content`` and sent as the user
    message, with ``system_prompt`` as the system message.

    Returns:
        The content of the first completion choice, or ``None`` when any
        step fails (the error is printed rather than raised).
    """
    try:
        user_text = prepare_content(batch)
        request = {
            "model": "gpt-4o-2024-05-13",  # gpt-4-turbo-2024-04-09
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
            ],
            "temperature": 1,
            "max_tokens": 1000,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }
        completion = await client.chat.completions.create(**request)
        # Kept inside the try so a malformed reply is also reported, not raised
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"Failed to get completion: {e}")
        return None

response = await call_openai(df.iloc[:10])
print(response)

```jsonl
{"name": "Self-perception", "samples": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]},
{"name": "Perception by others", "samples": ["1", "2", "3", "4", "5", "6", "7", "9", "10"]},
{"name": "Cultural/ethnic identity", "samples": ["2", "4", "6", "8", "10"]},
{"name": "Anger/temper", "samples": ["1", "3", "4", "7"]},
{"name": "Hobbies/interests", "samples": ["1", "2", "3", "4", "5", "7", "8", "9", "10"]},
{"name": "Fashion/style", "samples": ["1", "4"]},
{"name": "Politics", "samples": ["4", "5"]},
{"name": "Family importance", "samples": ["6", "7", "8"]},
{"name": "Sports", "samples": ["1", "2", "3", "8", "10"]}
```


In [None]:
import json

def parse_labels(response):
    """Parse the model's JSONL-style reply into a list of label dicts.

    The reply is expected to hold one JSON object per line, optionally wrapped
    in a Markdown code fence, with optional trailing commas and "// ..."
    placeholder lines (all of which appear in the prompt's format example).

    Args:
        response: Reply text from the model, or ``None`` / "" when the request
            failed.

    Returns:
        A list of the dicts parsed from the reply, in order. Lines that are
        blank, fences, ``//`` comments, not valid JSON (e.g. a reply cut off
        by the token limit) or not JSON objects are skipped; unparseable
        lines are reported with a printed message.
    """
    if not response:
        return []

    labels_list = []
    for raw_line in response.splitlines():
        line = raw_line.strip()
        # Fences and comments are filtered wherever they occur, instead of
        # assuming the first and last lines are always the fence.
        if not line or line.startswith("```") or line.startswith("//"):
            continue
        # Strip only a trailing comma so "}," inside a label name is untouched.
        line = line.rstrip(",").rstrip()
        try:
            label_dict = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Skipping unparseable label line {line!r}: {e}")
            continue
        if isinstance(label_dict, dict):
            labels_list.append(label_dict)
    return labels_list

parse_labels(response)

[{'name': 'Self-perception',
  'samples': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']},
 {'name': 'Perception by others',
  'samples': ['1', '2', '3', '4', '5', '6', '7', '9', '10']},
 {'name': 'Cultural/ethnic identity', 'samples': ['2', '4', '6', '8', '10']},
 {'name': 'Anger/temper', 'samples': ['1', '3', '4', '7']},
 {'name': 'Hobbies/interests',
  'samples': ['1', '2', '3', '4', '5', '7', '8', '9', '10']},
 {'name': 'Fashion/style', 'samples': ['1', '4']},
 {'name': 'Politics', 'samples': ['4', '5']},
 {'name': 'Family importance', 'samples': ['6', '7', '8']},
 {'name': 'Sports', 'samples': ['1', '2', '3', '8', '10']}]

In [None]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

def split_dataframe(df, batch_size, random_state=None):
    """Shuffle a DataFrame and cut it into consecutive batches.

    Args:
        df: DataFrame to split; it is not modified.
        batch_size: Maximum number of rows per batch (must be >= 1).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle, and therefore the batch composition, can be reproduced.
            ``None`` keeps the original unseeded behaviour.

    Returns:
        A list of DataFrame slices of the shuffled, re-indexed frame; every
        batch has ``batch_size`` rows except possibly the last.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative size would
            otherwise silently produce no batches at all).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # frac=1 samples every row once, i.e. a full shuffle
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return [
        shuffled.iloc[start:start + batch_size]
        for start in range(0, len(shuffled), batch_size)
    ]

batches = split_dataframe(df, 10)

tasks = [asyncio.create_task(call_openai(batch)) for batch in batches]

results = await asyncio.gather(*tasks)

In [None]:
# Gather every label name proposed across the batch replies
label_names = []
for response in results:
    for label in parse_labels(response):
        label_names.append(label['name'])

# Different batches often propose the same label; dict.fromkeys drops exact
# repeats while keeping first-seen order, so the list of allowed labels sent
# back to the model is not padded with duplicates.
labels_list = ", ".join(dict.fromkeys(label_names))
labels_list

"animal lovers, sports enthusiasts, religious identity, celebrity admiration, self-description, ethnic identity, musical preferences, fashion and appearance, self vs. perception, cultural heritage, unique perspectives, self-perception vs. others' perception, music influence, sports interest, ethnicity and cultural identity, political beliefs, media representation, fashion and appearance, personal interests and hobbies, family and upbringing, regional identity, self-awareness and introspection, social roles and relationships, media and celebrity influence, perception of violence, academic performance, personal values and beliefs, gender equality, positive role models, negative stereotypes, importance of religion, humor and personality, discovering identity, family influences, sports interests, ethnicity and nationality, self-perception versus others' perception, celebrities and role models, emotions and mental state, personal passions and hobbies, conflict and defensive behavior, music 

In [None]:
async def call_openai(batch, labels=None):
    """Ask the chat model to label one batch of samples.

    Args:
        batch: Rows to label; rendered into the user message with
            ``prepare_content``.
        labels: Optional restriction on the labels the model may apply. Either
            a ready-made string or an iterable of label names (joined with
            ", "). When omitted or empty the model chooses labels freely.

    Returns:
        The content of the first completion choice, or ``None`` when any step
        fails (the error is printed rather than raised).
    """
    try:
        documents = prepare_content(batch)
        if labels:
            # Use the argument that was passed in, not the notebook-global
            # `labels_list`, so the function works with whatever the caller gives.
            allowed = labels if isinstance(labels, str) else ", ".join(map(str, labels))
            # The blank line keeps the label list from running into the first
            # "## Sample" heading.
            documents = (
                "YOU MUST ONLY APPLY THE FOLLOWING LABELS:\n"
                f"{allowed}\n\n"
                f"{documents}"
            )

        response = await client.chat.completions.create(
            model="gpt-4o-2024-05-13", # gpt-4-turbo-2024-04-09
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": documents}
            ],
            temperature=1,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Failed to get completion: {e}")
        return None


In [None]:
batches = split_dataframe(df, 10)

tasks = [asyncio.create_task(call_openai(batch, labels_list)) for batch in batches]

results = await asyncio.gather(*tasks)

all_labels = []
for response in results:
    labels = parse_labels(response)
    all_labels.extend(labels)

all_labels

[{'name': 'self vs. perception',
  'samples': ['33', '6', '5', '11', '21', '60', '45', '16', '58']},
 {'name': 'self-description',
  'samples': ['33', '6', '5', '11', '21', '60', '45', '16']},
 {'name': 'family and upbringing', 'samples': ['6']},
 {'name': 'ethnic identity',
  'samples': ['6', '2', '5', '11', '16', '21', '45', '58', '60']},
 {'name': 'music preferences', 'samples': ['5', '11', '45', '60']},
 {'name': 'media and celebrity influence',
  'samples': ['5', '11', '45', '60', '58']},
 {'name': 'fashion and appearance', 'samples': ['21', '33']},
 {'name': 'emotions and mental state', 'samples': ['11', '60']},
 {'name': 'importance of religion', 'samples': ['6', '2']},
 {'name': 'sports interest', 'samples': ['5', '45', '11', '16', '58']},
 {'name': "self-perception vs. others' perception",
  'samples': ['33', '6', '5', '11', '60', '21', '45', '58', '16']},
 {'name': 'ethnicity and cultural identity',
  'samples': ['6', '2', '58', '11', '45', '60', '21', '16']},
 {'name': 'self

In [None]:
# Flatten the label dicts into (sample id, label name) pairs
label_data = [
    (int(sample), label['name'])
    for label in all_labels
    for sample in label['samples']
]

# One row per sample, with all of its label names joined into a single
# comma-separated string
labels_df = (
    pd.DataFrame(label_data, columns=['sample', 'label'])
    .groupby('sample')['label']
    .apply(', '.join)
    .reset_index()
)

# Left-join on df['id'] == labels_df['sample'] so rows that received no label
# are kept, then rename the joined column to 'labels'
df_merged = (
    df.merge(labels_df, left_on='id', right_on='sample', how='left')
    .rename(columns={'label': 'labels'})
)

df_merged.head()

Unnamed: 0,name,age,content,len,id,sample,labels
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1,1.0,"sports enthusiasts, celebrity admiration, self..."
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2,2.0,"ethnic identity, importance of religion, ethni..."
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3,3.0,"sports enthusiasts, celebrity admiration, self..."
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4,4.0,"how-others-see-me, celebrities and public figu..."
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5,5.0,"self vs. perception, self-description, ethnic ..."


In [None]:
# Expand the comma-separated 'labels' string into one 0/1 indicator column per
# label and append those columns to the merged frame
one_hot = df_merged['labels'].str.get_dummies(sep=', ')
final_df = pd.concat(
    [df_merged, one_hot],
    axis=1,
)
final_df.head()

Unnamed: 0,name,age,content,len,id,sample,labels,academics and future aspirations,animal affinity,animal lovers,...,self-awareness and introspection,self-description,self-perception vs external perception,self-perception vs. others' perception,social roles and relationships,sports enthusiasts,sports interest,sports interests,unique perspectives,unique-to-sample-58
0,Michelle,13,I: Let's begin with ‚how I see myself'. Can yo...,4125,1,1.0,"sports enthusiasts, celebrity admiration, self...",0,0,0,...,0,1,0,0,1,1,1,0,0,0
1,Sapan,13,I: You've just done one collage yes [how I see...,3847,2,2.0,"ethnic identity, importance of religion, ethni...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jason,13,"I: So Jason, are these two separate collages o...",8701,3,3.0,"sports enthusiasts, celebrity admiration, self...",0,0,0,...,1,1,0,0,1,1,0,0,0,0
3,Malcolm,13,"I: OK Malcolm, are these two separate collages...",6486,4,4.0,"how-others-see-me, celebrities and public figu...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Joe,14,I: So let's start with ‚how I see myself'. Do\...,5451,5,5.0,"self vs. perception, self-description, ethnic ...",0,0,0,...,1,1,0,1,0,0,1,0,0,0


In [None]:
final_df.shape

(65, 49)

In [None]:
final_df.to_csv('qualitative_analysis.csv', index=False)