In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from scipy.special import rel_entr
import statsmodels.api as sm
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

### merge published date and binary output of racism types

In [70]:
binary_df = pd.read_csv('concepts_binary_output.csv')  # 這是包含歧視類型的文件
date_df = pd.read_csv('articles.csv')  # 這是包含日期與 id 的文件

# 合併資料，根據 `id` 連接兩個表格
date_df = date_df[['id', 'date']]  # 只保留需要的欄位
merged_df = pd.merge(date_df, binary_df, left_on="id", right_on="article_id")
merged_df.drop(columns=['article_id'], inplace=True)  # 刪除多餘的 article_id 欄位

# 確保日期格式正確
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')

event_date = pd.to_datetime("2021-03-16")
merged_df["period"] = merged_df["date"].apply(lambda x: "0" if x < event_date else "1")


In [3]:
print(merged_df.columns)  # 顯示合併後的資料框架的前幾行

Index(['id', 'date', 'Anti-Asian hate crimes(general)',
       'Anti-Asian hate crimes-physical violence related',
       'Attacked by white or not specified',
       'COVID-19 or coronavirus or pandemic', 'China/Chinese virus',
       'Ching Chong', 'Commie', 'Cultural complicity', 'Discrimination',
       'Exotic (describing Asian women)', 'Fetishized or fetishization',
       'Hypersexualization of Asian women', 'Microaggressions', 'Misogyny',
       'Model Minority Myth (or myth of the model minority)',
       'Online harassment', 'Other minorities (Black) attack AA',
       'Otherized (or othering)', 'Page Act', 'Perpetual Foreigner',
       'Physical harassment', 'Preserve whiteness', 'Racial bias',
       'Racial inequity', 'Racial prejudice/bigotry',
       'Racial injustice/oppression',
       'Racism (gendered, misogynistic) or racism toward Asian American women',
       'Racism (general)', 'Recidivism', 'Scapegoat', 'Sexual violence',
       'Stereotypes', 'Systemic racism',

In [4]:
output_path = "./statistic/merged_binary_chatgpt.csv"
merged_df.to_csv(output_path, index=False)

### Group period, sum each racism types(count)

In [71]:
merged_df = pd.read_csv( "./statistic_chatgpt/merged_binary_chatgpt.csv")
merged_df.drop(columns=['date', 'id'], inplace=True)

# Group by 'period' and sum binary columns
summary_df = merged_df.groupby('period').sum(numeric_only=True)

# Transpose so racism types are rows and periods are columns
summary_df = summary_df.T
summary_df.columns = ['Before_1', 'After_1']  # if your periods are '0' and '1'


not_mentioned_before = 335 - summary_df['Before_1']
not_mentioned_after = 299 - summary_df['After_1']
summary_df.insert(1, 'Before_0', not_mentioned_before)  # Insert after 'Before Event'
summary_df.insert(3, 'After_0', not_mentioned_after)    # Insert after 'After Event'
# summary_df.loc['Sum'] = summary_df.sum()

# Add percentage columns 
# percentages = summary_df.drop(index='Sum').div(summary_df.loc['Sum'], axis=1) * 100
percentages_before = summary_df['Before_1'].div(335) * 100
percentages_after = summary_df['After_1'].div(299) * 100
percentages_before = percentages_before.round(2)
percentages_after = percentages_after.round(2)
summary_df.insert(4, 'Before_1 %', percentages_before)  # Insert after 'Before Event'
summary_df.insert(5, 'After_1 %', percentages_after)    # Insert after 'After Event'



# Display result
print(summary_df)

output_path = "./statistic_chatgpt/count_percentage_chatgpt.csv"
# summary_df.to_csv(output_path, index=True, index_label="Racism Type")



                                                    Before_1  Before_0  \
Anti-Asian hate crimes(general)                          141       194   
Anti-Asian hate crimes-physical violence related          87       248   
Attacked by white or not specified                         5       330   
COVID-19 or coronavirus or pandemic                        9       326   
China/Chinese virus                                      133       202   
Ching Chong                                                4       331   
Commie                                                     4       331   
Cultural complicity                                        4       331   
Discrimination                                           108       227   
Exotic (describing Asian women)                            1       334   
Fetishized or fetishization                                6       329   
Hypersexualization of Asian women                          1       334   
Microaggressions                      

## Racism types

### chi square

In [6]:
# Store results
results = []

# Get list of racism types (exclude 'Sum' and any percent columns)
racism_types = [col for col in summary_df.index]


for racism_type in racism_types:
    # Counts of articles with this racism type
    before_1 = summary_df.loc[racism_type, 'Before_1']
    after_1 = summary_df.loc[racism_type, 'After_1']
    before_0 = summary_df.loc[racism_type, 'Before_0']
    after_0 = summary_df.loc[racism_type, 'After_0']

    
    # Build contingency table
    contingency = [
        [before_1, before_0],
        [after_1, after_0]
    ]
    # Print contingency table
    print(f"Contingency table for {racism_type}:")
    print(pd.DataFrame(contingency, columns=['Before_1', 'Before_0'], index=['After_1', 'After_0']))
    
    # Run chi-square test
    chi2, p, _, _ = chi2_contingency(contingency)
    
    # Store results
    results.append({
        'Racism Type': racism_type,
        'Chi2': chi2,
        'p-value': p,
        'Significant (p < 0.05)': p < 0.05,
        'before_1': before_1,
        'after_1': after_1
    })

# Convert to DataFrame and sort by p-value
chi_df = pd.DataFrame(results).sort_values(by='p-value')
chi_df.to_csv('./statistic/chi2_chatgpt.csv', index=False)
# Display
display(chi_df)


Contingency table for Anti-Asian hate crimes(general):
         Before_1  Before_0
After_1       141       194
After_0       215        84
Contingency table for Anti-Asian hate crimes-physical violence related:
         Before_1  Before_0
After_1        87       248
After_0       145       154
Contingency table for Attacked by white or not specified:
         Before_1  Before_0
After_1         5       330
After_0         9       290
Contingency table for COVID-19 or coronavirus or pandemic:
         Before_1  Before_0
After_1         9       326
After_0        10       289
Contingency table for China/Chinese virus:
         Before_1  Before_0
After_1       133       202
After_0        93       206
Contingency table for Ching Chong:
         Before_1  Before_0
After_1         4       331
After_0        13       286
Contingency table for Commie:
         Before_1  Before_0
After_1         4       331
After_0         5       294
Contingency table for Cultural complicity:
         Before_1

Unnamed: 0,Racism Type,Chi2,p-value,Significant (p < 0.05),before_1,after_1
26,"Racism (gendered, misogynistic) or racism towa...",98.341,3.521979e-23,True,1,81
0,Anti-Asian hate crimes(general),55.842396,7.851985e-14,True,141,215
1,Anti-Asian hate crimes-physical violence related,33.583352,6.827458e-09,True,87,145
27,Racism (general),27.442455,1.618391e-07,True,78,129
11,Hypersexualization of Asian women,22.913524,1.694554e-06,True,1,24
25,Racial injustice/oppression,20.735927,5.271758e-06,True,90,133
19,Perpetual Foreigner,20.572216,5.742351e-06,True,66,108
33,Terrorism,16.530829,4.786546e-05,True,15,42
34,Verbal harassment,11.614986,0.0006542255,True,188,208
10,Fetishized or fetishization,6.640224,0.009970141,True,6,18


### KL divergence

In [11]:
rows = []

epsilon = 1e-10  # small value to avoid division by zero
racism_types = [col for col in summary_df.index]


for racism_type in racism_types:
    before_1 = summary_df.loc[racism_type, 'Before_1']
    after_1 = summary_df.loc[racism_type, 'After_1']
    before_0 = summary_df.loc[racism_type, 'Before_0']
    after_0 = summary_df.loc[racism_type, 'After_0']
    
    # Probabilities
    p_has = before_1 / 335
    p_not = 1 - p_has
    q_has = after_1 / 299
    q_not = 1 - q_has

    # Add small epsilon to avoid log(0)
    P = np.array([p_has + epsilon, p_not + epsilon])
    Q = np.array([q_has + epsilon, q_not + epsilon])
    
    # Compute KL divergence: D_KL(P || Q)
    kl_div = np.sum(rel_entr(P, Q))
    
    rows.append({
        'Racism Type': racism_type,
        'Before_Has': before_1,
        'After_Has': after_1,
        'Before %': round(p_has * 100, 2),
        'After %': round(q_has * 100, 2),
        'KL Divergence': round(kl_div, 6)
    })

# Create DataFrame and sort by divergence
kl_df = pd.DataFrame(rows).sort_values(by='KL Divergence', ascending=False)

# Save to CSV
kl_df.to_csv("./statistic/kl_divergence_chatgpt.csv", index=False)

# Display
display(kl_df)


Unnamed: 0,Racism Type,Before_Has,After_Has,Before %,After %,KL Divergence
26,"Racism (gendered, misogynistic) or racism towa...",1,81,0.3,27.09,0.298568
0,Anti-Asian hate crimes(general),141,215,42.09,71.91,0.193481
1,Anti-Asian hate crimes-physical violence related,87,145,25.97,48.49,0.106385
27,Racism (general),78,129,23.28,43.14,0.086224
11,Hypersexualization of Asian women,1,24,0.3,8.03,0.070616
25,Racial injustice/oppression,90,133,26.87,44.48,0.066082
19,Perpetual Foreigner,66,108,19.7,36.12,0.06426
33,Terrorism,15,42,4.48,14.05,0.049639
34,Verbal harassment,188,208,56.12,69.57,0.040018
38,Yellow peril,10,2,2.99,0.67,0.02176


### logistic regression

In [14]:
results = []

# Get list of racism types
racism_types = [col for col in summary_df.index]

for racism_type in racism_types:
    # Prepare data for logistic regression
    X = merged_df[['period']]
    X = sm.add_constant(X)  # adds intercept
    y = merged_df[racism_type]

    # Check if there is enough variation (avoid crash)
    if y.nunique() < 2:
        continue  # skip racism types that are all 0 or all 1

    # Fit logistic regression model
    model = sm.Logit(y, X).fit(disp=0)
    
    # Extract results
    coef = model.params['period']
    p_value = model.pvalues['period']
    odds_ratio = np.exp(coef)

    results.append({
        'Racism Type': racism_type,
        'Coefficient (log-odds)': round(coef, 4),
        'Odds Ratio': round(odds_ratio, 4),
        'p-value': round(p_value, 4),
        'Significant (p < 0.05)': p_value < 0.05
    })

# Create and sort result DataFrame
logit_df = pd.DataFrame(results).sort_values(by='p-value')

# Save to CSV
logit_df.to_csv('./statistic/logistic_regression_chatgpt.csv', index=False)

# Show top results
display(logit_df)





Unnamed: 0,Racism Type,Coefficient (log-odds),Odds Ratio,p-value,Significant (p < 0.05)
0,Anti-Asian hate crimes(general),0.697,2.0077,0.0,True
4,China/Chinese virus,-0.7347,0.4796,0.0,True
26,"Racism (gendered, misogynistic) or racism towa...",4.5542,95.0315,0.0,True
1,Anti-Asian hate crimes-physical violence related,0.6204,1.8596,0.0002,True
27,Racism (general),0.5734,1.7743,0.0009,True
33,Terrorism,0.9984,2.714,0.0014,True
11,Hypersexualization of Asian women,3.1354,22.9968,0.0022,True
19,Perpetual Foreigner,0.5186,1.6796,0.0044,True
8,Discrimination,-0.4466,0.6398,0.0097,True
25,Racial injustice/oppression,0.4246,1.529,0.0117,True


### co occurence

In [15]:
# Get the list of racism type columns
racism_types = [col for col in summary_df.index]

# Compute co-occurrence matrix for ALL articles
# co_occurrence_all = merged_df[racism_types].T.dot(merged_df[racism_types])

# Optional: Normalize by diagonal to get co-occurrence rate (i.e., percentage of co-appearance)
# co_occurrence_rate = co_occurrence_all / co_occurrence_all.values.diagonal().reshape(-1, 1)

# Compute co-occurrence matrices separately for before and after the event
before_df = merged_df[merged_df['period'] == 0]
after_df = merged_df[merged_df['period'] == 1]

co_before = before_df[racism_types].T.dot(before_df[racism_types])
co_after = after_df[racism_types].T.dot(after_df[racism_types])

# Save to CSV if needed
# co_occurrence_all.to_csv("co_occurrence_all.csv")
co_before.to_csv("./statistic/co_occurrence_before.csv")
co_after.to_csv("./statistic/co_occurrence_after.csv")


## Term Frequency

### merge files

In [33]:
classified_df = pd.read_csv("classification_results_with_race_gpt.csv")
article_df = pd.read_csv("articles.csv")

classified_df = classified_df[['article_id', 'context', 'concepts']]
article_df = article_df[['id', 'date']]

merged_df = pd.merge(article_df, classified_df, left_on="id", right_on="article_id")
merged_df.drop(columns=['article_id'], inplace=True)  # 刪除多餘的 article_id 欄位

# 確保日期格式正確
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')

event_date = pd.to_datetime("2021-03-16")
merged_df["period"] = merged_df["date"].apply(lambda x: "0" if x < event_date else "1")

display(merged_df.head()) 
display(merged_df.tail()) 

merged_df.to_csv("./statistic_chatgpt/merged_classification_chatgpt.csv", index=False)

Unnamed: 0,id,date,context,concepts,period
0,1,2020-04-21,Racism has its own virulence that is bad for t...,Racism (general),0
1,1,2020-04-21,But my parents were spurred to action after th...,Terrorism,0
2,1,2020-04-21,"In recent years, they raised their voices agai...",Racial injustice/oppression,0
3,1,2020-04-21,The resilience I learned from them has buoyed ...,Racial Prejudice/bigotry,0
4,1,2020-04-21,"Now, I must speak out in the face of new attac...",Anti-Asian hate crimes(general),0


Unnamed: 0,id,date,context,concepts,period
4833,634,2021-02-17,"For example, earlier this year, a 91-year-old ...",Anti-Asian hate crimes-physical violence related,0
4834,634,2021-02-17,The hate often is linked to individuals blamin...,Scapegoat,0
4835,634,2021-02-17,Contrast this to the previous administration w...,Racial injustice/oppression,0
4836,634,2021-02-17,Asian,,0
4837,634,2021-02-17,Whether it? asking young Asian-Americans abou...,Kung flu/plague,0


### for all articles

In [38]:
sentences_before = merged_df[merged_df['period'] == '0']['context'].dropna().tolist()
sentences_after = merged_df[merged_df['period'] == '1']['context'].dropna().tolist()
stop_words = set(stopwords.words('english'))

def tokenize(text):
    words = text.split()
    tokens = []
    for word in words:
        token = ''.join(char for char in word if char.isalnum())
        tokens.append(token)
    return tokens

def clean_text(text):
    text = text.lower()                         # Lowercase
    tokens = tokenize(text)                # Tokenize
    tokens = [word for word in tokens if word.isalpha()]  # Keep only alphabetic words
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply cleaning
sentences_before = [clean_text(s) for s in sentences_before]
sentences_after = [clean_text(s) for s in sentences_after]

# 假設 sentences_before, sentences_after 是該歧視類別對應的句子
vec = CountVectorizer(binary=True)
X_before = vec.fit_transform(sentences_before)
X_after = vec.transform(sentences_after)

freq_before = X_before.sum(axis=0).A1
freq_after = X_after.sum(axis=0).A1
vocab = vec.get_feature_names_out()

df_freq = pd.DataFrame({'word': vocab, 'before': freq_before, 'after': freq_after})
# Rank words by frequency (higher freq = lower rank number, like 1st place)
df_freq['rank_before'] = df_freq['before'].rank(method='min', ascending=False).astype(int)
df_freq['rank_after'] = df_freq['after'].rank(method='min', ascending=False).astype(int)

# Top 10 words before
df_before_top = df_freq.sort_values('before', ascending=False).head(10)
display(df_before_top)

# Top 10 words after
df_after_top = df_freq.sort_values('after', ascending=False).head(10)
display(df_after_top)

# df_freq.sort_values(df_freq['before'] + df_freq['after'], ascending=False).to_csv("./statistic_chatgpt/word_frequency_chatgpt.csv", index=False)
df_freq['total'] = df_freq['before'] + df_freq['after']
df_freq = df_freq.sort_values('total', ascending=False)
df_freq = df_freq.drop(columns='total')
df_freq.to_csv("./statistic_chatgpt/word_frequency_chatgpt.csv", index=False)



Unnamed: 0,word,before,after,rank_before,rank_after
421,asian,575,985,1,1
5701,said,472,580,2,2
1102,chinese,335,194,3,12
4801,people,317,347,4,5
253,americans,294,392,5,4
7149,virus,238,155,6,21
2967,hate,236,432,7,3
1084,china,223,178,8,16
4706,pandemic,210,265,9,7
1478,coronavirus,201,117,10,38


Unnamed: 0,word,before,after,rank_before,rank_after
421,asian,575,985,1,1
5701,said,472,580,2,2
2967,hate,236,432,7,3
253,americans,294,392,5,4
4801,people,317,347,4,5
248,american,180,321,12,6
4706,pandemic,210,265,9,7
311,antiasian,117,261,24,8
7357,women,24,236,240,9
7141,violence,81,225,40,10


In [31]:
count = ((df_freq['before'] > 15) & (df_freq['after'] > 15)).sum()
print(count)


333


### log ratio

In [40]:
df_freq['log_ratio'] = np.log2((df_freq['after'] + 1) / (df_freq['before'] + 1))
df_freq[(df_freq['before'] + df_freq['after']) > 5]
# mask = (df_freq['before'] + df_freq['after']) > 5

# Sort with mask True rows first, then by log_ratio
# df_freq = df_freq.sort_values(by=[mask, 'log_ratio'], ascending=[False, False])
df_freq = df_freq.sort_values('log_ratio', ascending=False)
df_freq.to_csv("./statistic_chatgpt/word_log_ratio_more_chatgpt.csv", index=False)
df_freq = df_freq.sort_values('log_ratio', ascending=True)
df_freq.to_csv("./statistic_chatgpt/word_log_ratio_less_chatgpt.csv", index=False)

### chi2

In [35]:
# Set the total number of contexts before and after the event
# You must provide or calculate these
total_contexts_before = 2227
total_contexts_after = 2612
results = []

for _, row in df_freq.iterrows():
    word = row['word']
    count_before = row['before']  # contexts before event where word appeared
    count_after = row['after']   # contexts after event where word appeared

    # Build the contingency table
    table = [[count_before, total_contexts_before - count_before],
             [count_after, total_contexts_after - count_after]]
    
    chi2, p, dof, expected = chi2_contingency(table)
    
    results.append({
        'word': word,
        'before_contexts': count_before,
        'after_contexts': count_after,
        'chi2': chi2,
        'p_value': p,
        'Significant (p < 0.05)': p < 0.05,
    })

# Create a DataFrame from results
chi2_df = pd.DataFrame(results)
chi2_df = chi2_df.sort_values(by='p_value')

# Filter significant results if needed
# significant_words = chi2_df[chi2_df['p_value'] < 0.05]

# Save results
chi2_df.to_csv('./statistic_chatgpt/chi2_freq_chatgpt.csv', index=False)



### for each racism type

In [None]:
summary_results = []  # To store (racism_type, len_before, len_after, num_words)

# STEP 1: Expand 'concepts' into multiple rows
exploded_df = pd.read_csv('./statistic_chatgpt/merged_classification_chatgpt.csv')
exploded_df['concepts'] = exploded_df['concepts'].fillna('')  # handle NaNs
exploded_df['concepts'] = exploded_df['concepts'].str.split(';')
exploded_df = exploded_df.explode('concepts')
exploded_df['concepts'] = exploded_df['concepts'].str.strip()  # remove spaces

# STEP 2: Drop empty concepts (if any)
exploded_df = exploded_df[exploded_df['concepts'] != '']
# display(exploded_df[11:20])

racism_types = [col for col in summary_df.index]

for racism_type in racism_types:
    print(racism_type)
    # Filter rows that include the current racism_type
    # type_df = exploded_df[exploded_df['concepts'].apply(lambda x: racism_type in x)]
    type_df = exploded_df[exploded_df['concepts'] == racism_type]
    # display(type_df.head(10))
    # Get before and after texts
    sentences_before = type_df[type_df['period'] == 0]['context'].dropna().tolist()
    sentences_after = type_df[type_df['period'] == 1]['context'].dropna().tolist()
    print(len(sentences_before), len(sentences_after))
    # Clean the texts
    sentences_before = [clean_text(s) for s in sentences_before]
    sentences_after = [clean_text(s) for s in sentences_after]

    if all(s.strip() == '' for s in sentences_before) and all(s.strip() == '' for s in sentences_after):
        continue

    # Skip if not enough data
    if len(sentences_before) == 0 and len(sentences_after) == 0:
        continue

    # Vectorize
    vec = CountVectorizer(binary=True)
    if len(sentences_before) == 0:
        X_after = vec.fit_transform(sentences_after)
        freq_after = X_after.sum(axis=0).A1
        vocab = vec.get_feature_names_out()
        freq_before = [0] * len(vocab)  # no data, so zero count
    elif len(sentences_after) == 0:
        X_before = vec.fit_transform(sentences_before)
        freq_before = X_before.sum(axis=0).A1
        vocab = vec.get_feature_names_out()
        freq_after = [0] * len(vocab)  # no data, so zero count
    else:
        X_before = vec.fit_transform(sentences_before)
        X_after = vec.transform(sentences_after)  # use same vocab
        freq_before = X_before.sum(axis=0).A1
        freq_after = X_after.sum(axis=0).A1
        vocab = vec.get_feature_names_out()

    summary_results.append({
        'racism_type': racism_type,
        'num_contexts_before': len(sentences_before),
        'num_contexts_after': len(sentences_after)
        # 'num_words_in_vocab': len(vocab)
    })

    df_freq = pd.DataFrame({
        'word': vocab,
        'before': freq_before,
        'after': freq_after
    })

    df_freq['rank_before'] = df_freq['before'].rank(method='min', ascending=False).astype(int)
    df_freq['rank_after'] = df_freq['after'].rank(method='min', ascending=False).astype(int)

    df_freq['total'] = df_freq['before'] + df_freq['after']
    df_freq = df_freq[df_freq['total'] > 5]  # Optional threshold
    df_freq = df_freq.sort_values('total', ascending=False).drop(columns='total')

    safe_type = racism_type.replace(' ', '_').replace('/', '_')
    # #print(safe_type)
    # Top 10 words before
    df_before_top = df_freq.sort_values('before', ascending=False).head(3)
    # display(df_before_top)

    # Top 10 words after
    df_after_top = df_freq.sort_values('after', ascending=False).head(3)
    # display(df_after_top)

    safe_type = racism_type.replace(' ', '_').replace('/', '_')
    df_freq.to_csv(f"./statistic_chatgpt/word_frequency_chatgpt/{safe_type}.csv", index=False)
summary_results = pd.DataFrame(summary_results)
summary_results.to_csv('./statistic_chatgpt/word_frequency_chatgpt/racism_type_word_freq_summary.csv', index=False)




Anti-Asian hate crimes(general)
259 472
Anti-Asian hate crimes-physical violence related
100 207
Attacked by white or not specified
5 13
COVID-19 or coronavirus or pandemic
12 10
China/Chinese virus
199 121
Ching Chong
5 12
Commie
4 5
Cultural complicity
4 2
Discrimination
165 145
Exotic (describing Asian women)
1 2
Fetishized or fetishization
5 22
Hypersexualization of Asian women
0 31
Microaggressions
12 17
Misogyny
1 5
Model Minority Myth (or myth of the model minority)
15 12
Online harassment
26 41
Other minorities (Black) attack AA
9 9
Otherized (or othering)
8 14
Page Act
6 11
Perpetual Foreigner
0 1
Physical harassment
121 125
Preserve whiteness
15 11
Racial bias
44 55
Racial inequity
69 37
Racial prejudice/bigotry
182 185
Racial injustice/oppression
177 280
Racism (gendered, misogynistic) or racism toward Asian American women
1 111
Racism (general)
110 184
Recidivism
0 1
Scapegoat
234 186
Sexual violence
1 7
Stereotypes
22 18
Systemic racism
81 96
Terrorism
17 42
Verbal harassm

### chi2

In [80]:
results = []

total_df = pd.read_csv("./statistic_chatgpt/word_frequency_chatgpt/racism_type_word_freq_summary.csv")

racism_types = [col for col in summary_df.index]
# print(racism_types)
for racism_type in racism_types:
    safe_type = racism_type.replace(' ', '_').replace('/', '_')
    #　print(safe_type)
    total_contexts_before = total_df[total_df['racism_type'] == racism_type]['num_contexts_before'].iloc[0]
    total_contexts_after = total_df[total_df['racism_type'] == racism_type]['num_contexts_after'].iloc[0]
    word_freq_df = pd.read_csv(f"./statistic_chatgpt/word_frequency_chatgpt/{safe_type}.csv")


    for _, row in word_freq_df.iterrows():
        word = row['word']
        count_before = row['before']  # contexts before event where word appeared
        count_after = row['after']   # contexts after event where word appeared

        # Build the contingency table
        table = [[count_before, total_contexts_before - count_before],
                [count_after, total_contexts_after - count_after]]
        
        try:
            chi2, p, dof, expected = chi2_contingency(table)
        except ValueError:
            continue
        
        results.append({
            'Racism Type': racism_type,
            'word': word,
            'before_contexts': count_before,
            'after_contexts': count_after,
            'chi2': chi2,
            'p_value': p,
            'Significant': p < 0.05,
        })

    # Convert to DataFrame
chi2_df = pd.DataFrame(results)
chi2_df = chi2_df[chi2_df['Significant'] == True]

# Sort by racism_type first, then p_value within each group
chi2_df = chi2_df.sort_values(by=['Racism Type', 'p_value'])

# Optional: Reset index for cleanliness
chi2_df.reset_index(drop=True, inplace=True)

chi2_df.to_csv('./statistic_chatgpt/word_frequency_chatgpt/chi2.csv', index=False)



### log ratio

In [None]:
df_freq['log_ratio'] = np.log2((df_freq['after'] + 1) / (df_freq['before'] + 1))
df_freq[(df_freq['before'] + df_freq['after']) > 5]
# mask = (df_freq['before'] + df_freq['after']) > 5

# Sort with mask True rows first, then by log_ratio
# df_freq = df_freq.sort_values(by=[mask, 'log_ratio'], ascending=[False, False])
df_freq = df_freq.sort_values('log_ratio', ascending=False)
df_freq.to_csv("./statistic_chatgpt/word_log_ratio_more_chatgpt.csv", index=False)
df_freq = df_freq.sort_values('log_ratio', ascending=True)
df_freq.to_csv("./statistic_chatgpt/word_log_ratio_less_chatgpt.csv", index=False)

In [85]:
results = []

total_df = pd.read_csv("./statistic_chatgpt/word_frequency_chatgpt/racism_type_word_freq_summary.csv")

racism_types = [col for col in summary_df.index]
# print(racism_types)
for racism_type in racism_types:
    safe_type = racism_type.replace(' ', '_').replace('/', '_')
    #　print(safe_type)
    total_contexts_before = total_df[total_df['racism_type'] == racism_type]['num_contexts_before'].iloc[0]
    total_contexts_after = total_df[total_df['racism_type'] == racism_type]['num_contexts_after'].iloc[0]
    word_freq_df = pd.read_csv(f"./statistic_chatgpt/word_frequency_chatgpt/{safe_type}.csv")


    for _, row in word_freq_df.iterrows():
        word = row['word']
        count_before = row['before']  # contexts before event where word appeared
        count_after = row['after']   # contexts after event where word appeared

        if count_before + count_after < 5:
            continue

        log_ratio = np.log2(
            ((count_after / total_contexts_after) + 1) / ((count_before / total_contexts_before) + 1)
        )

        results.append({
            'Racism Type': racism_type,
            'word': word,
            'before_contexts': count_before,
            'after_contexts': count_after,
            'log2_ratio': log_ratio
        })

logratio_df = pd.DataFrame(results)
logratio_df = logratio_df[abs(logratio_df['log2_ratio']) > 0.01]  # Replace inf with NaN
# Sort by racism_type and log2_ratio (descending for increased usage)
logratio_df = logratio_df.sort_values(by=['Racism Type', 'log2_ratio'], ascending=[True, False])

# Show top rows
display(logratio_df.head(10))

# Save to CSV
logratio_df.to_csv('./statistic_chatgpt/word_frequency_chatgpt/log_ratio.csv', index=False)



  ((count_after / total_contexts_after) + 1) / ((count_before / total_contexts_before) + 1)


Unnamed: 0,Racism Type,word,before_contexts,after_contexts,log2_ratio
1,Anti-Asian hate crimes(general),asian,94,228,0.121852
5,Anti-Asian hate crimes(general),antiasian,36,100,0.089465
46,Anti-Asian hate crimes(general),women,2,34,0.089253
32,Anti-Asian hate crimes(general),many,8,38,0.067823
65,Anti-Asian hate crimes(general),cities,1,23,0.063082
22,Anti-Asian hate crimes(general),according,13,45,0.060723
12,Anti-Asian hate crimes(general),stop,23,63,0.058009
54,Anti-Asian hate crimes(general),center,5,27,0.052667
20,Anti-Asian hate crimes(general),march,16,47,0.050468
76,Anti-Asian hate crimes(general),study,2,20,0.048774
