# Classifier ReadMe

Classifies subreddit posts into exactly one of three categories: cultural, political, or other.
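An OpenAI API key is required. Set it in the project configuration (**config.py**; the code below reads it as `OPENAI_API` via `config.settings`) before running the notebook.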

## Step 1: Loading Packages and Meta-attributes

In [2]:
import os
import sys
import openai
import pandas as pd
import numpy as np
sys.path.append('../') 
import config.settings as settings

In [5]:
# Read the OpenAI key and the post-data directory from the settings module.
openAI_api_key = settings.OPENAI_API
post_data_path = settings.PROJ_PATH + "/post_data"

In [6]:
# Shared OpenAI client, authenticated with the key read from settings.
client = openai.OpenAI(api_key=openAI_api_key)

In [8]:
def content_gen(body):
    """Build the classification prompt for a single post.

    Args:
        body: Text of the post. It is interpolated verbatim into the prompt.

    Returns:
        The prompt string. It lists the category definitions and asks for a
        one-word answer: CULTURAL, POLITICAL or OTHER.
    """
    # The first category heading reads POLITICAL (not POLITICS) so that it
    # matches the one-word answer the prompt demands, and the instruction
    # steps are numbered consecutively (1-3).
    content = f"""
      You are a specialized content classifier for the r/china subreddit. Your task is to categorize the discussion below and try your best to fit them into either political/cultural/other based on the standards below:
      {body}
      CATEGORIES:
      1. POLITICAL - Posts about:
         - Government, political parties, policies
         - International relations and diplomacy
         - Laws and regulations
         - Civil rights and activism
         - Current political events
         - Censorship and media control

      2. CULTURAL - Posts about:
         - Traditions and customs
         - Food and cuisine
         - Languages and linguistics
         - Arts and entertainment
         - History and heritage
         - Philosophy and religion
         - Daily life and social norms
         - Education and learning languages
         - Travel and tourism experiences

      3. OTHER

      INSTRUCTIONS:
      1. Analyze the provided post text
      2. Classify it into exactly one of the above categories
      3. **OUTPUT ONLY ONE SINGLE WORD, CULTURAL/POLITICAL/OTHER**"""
    return content

In [7]:
# Load the three raw post tables; `dfs` keeps them in china / hongkong / taiwan
# order so later cells can address them by position.
china_post_data, hk_post_data, tw_post_data = dfs = [
    pd.read_csv(post_data_path + raw_file)
    for raw_file in (
        "/raw_data_china_post.csv",
        "/raw_data_hongkong_post.csv",
        "/raw_data_taiwan_post.csv",
    )
]

In [None]:
# Show the frame loaded from raw_data_china_post.csv as the cell output.
china_post_data

In [None]:
# Classify every post body with the chat model and store the one-word answer
# in the 'gpt_score' column of the frame the post came from.
for df in dfs:
    for idx, row in df.iterrows():
        body = row['post_body']
        # read_csv represents empty cells as NaN, never None, so an `is None`
        # test lets missing bodies through as the literal text "nan".
        if pd.isna(body):
            continue

        completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": content_gen(body)
                }
            ],
            model="gpt-4o-mini",
        )
        # NOTE(review): the SDK types message.content as optional; fall back
        # to "" so the print below cannot raise on a missing reply. Stripping
        # removes stray whitespace/newlines around the one-word label.
        reply = completion.choices[0].message.content
        msg_content = reply.strip() if reply else ""
        # .at writes into the frame in place, so the entry in `dfs` is the
        # same updated object; no reassignment is needed.
        df.at[idx, 'gpt_score'] = msg_content
        print(f"Post{row['post_id']}: {msg_content}")


In [None]:
# Show the first frame in dfs (china_post_data) as the cell output, e.g. to
# inspect the 'gpt_score' column written by the classification loop.
dfs[0]

In [None]:
def merge_gpt_classifier_res(cmt_df, res_df, output):
    """Attach each post's 'gpt_score' to its comments and write the result to CSV.

    Args:
        cmt_df: Comment frame; must contain a 'post_id' column.
        res_df: Post frame; must contain 'post_id' and 'gpt_score' columns.
        output: Destination passed to ``DataFrame.to_csv``.

    Every row of ``cmt_df`` is kept (left join); comments whose post has no
    score get a missing value. Nothing is returned.
    """
    # Only the join key and the label are taken from the post frame.
    score_lookup = res_df[['post_id', 'gpt_score']]
    scored_comments = pd.merge(cmt_df, score_lookup, how='left', on='post_id')
    scored_comments.to_csv(output, index=False)

In [None]:
# Taiwan: join the labels in dfs[2] (the frame read from
# raw_data_taiwan_post.csv) onto the Taiwan comments and save the result.
tw_cmt_df = pd.read_csv(filepath_or_buffer="post_data/taiwan_comments.csv")
merge_gpt_classifier_res(
    cmt_df=tw_cmt_df,
    res_df=dfs[2],
    output="tw_scored_pnc_df.csv",
)

In [None]:
# Hong Kong: join the labels in dfs[1] (the frame read from
# raw_data_hongkong_post.csv) onto the Hong Kong comments and save the result.
hk_cmt_df = pd.read_csv(filepath_or_buffer="post_data/hongkong_comments.csv")
merge_gpt_classifier_res(
    cmt_df=hk_cmt_df,
    res_df=dfs[1],
    output="hk_scored_pnc_df.csv",
)

In [None]:
# China: join the labels in dfs[0] (the frame read from
# raw_data_china_post.csv) onto the China comments and save the result.
# The frame gets its own name so it no longer overwrites the Taiwan comments
# held in tw_cmt_df.
cn_cmt_df = pd.read_csv("post_data/china_comments.csv")
merge_gpt_classifier_res(cn_cmt_df, dfs[0], "cn_scored_pnc_df.csv")