In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the LIWC-22 result export. 'utf-8-sig' decodes UTF-8 and drops a
# leading byte-order mark if the file has one.
df_view = pd.read_csv(
    './LIWC-22 Results - Chinese_text_with_other_data (___ - LIWC Analysis.csv',
    encoding='utf-8-sig',
)

# Give the four metadata columns capitalised display names.
# NOTE(review): 'time_1' -> 'Time' presumably keeps it apart from a lowercase
# LIWC 'time' measure in the same file - confirm against the CSV header.
df_view = df_view.rename(columns={
    'gender': 'Gender',
    'location': 'Location',
    'time_1': 'Time',
    'gender_label': 'Category',
})

def changeGenderLabel(label):
    """Translate a numeric category code into its display name.

    Args:
        label: Raw code from the 'Category' column.

    Returns:
        'Crime-related' when ``label == 0``, 'Gender-related' when
        ``label == 1``, and 'Irrelevant or Ineffective' for anything else.
    """
    known_codes = ((0, 'Crime-related'), (1, 'Gender-related'))
    for code, display_name in known_codes:
        if label == code:
            return display_name
    # Every value that is neither 0 nor 1 lands in the catch-all category.
    return 'Irrelevant or Ineffective'

def changeGender(gender):
    """Translate a raw gender code into its display name.

    Args:
        gender: Raw value from the 'Gender' column.

    Returns:
        'Woman' when ``gender == 'f'``; 'Man' for every other value.

    NOTE(review): any value other than the exact string 'f' (a missing value,
    'F', an unexpected code) is reported as 'Man' - confirm the column only
    ever holds two codes.
    """
    return 'Woman' if gender == 'f' else 'Man'

# Overwrite the raw codes in both columns with their display names, in place.
df_view['Category'] = [changeGenderLabel(code) for code in df_view['Category'].tolist()]
df_view['Gender'] = [changeGender(code) for code in df_view['Gender'].tolist()]

In [2]:
def formatTime(time):
    """Shorten a timestamp string to its date part without the '2022/0' text.

    Everything from the first comma onward is discarded, then each occurrence
    of the literal '2022/0' is removed from what remains.

    Args:
        time: Timestamp string.

    Returns:
        The shortened string, e.g. '2022/06/15, 10:30' -> '6/15'. Strings that
        do not contain '2022/0' keep their date part unchanged.
    """
    date_part, _, _ = time.partition(',')
    return date_part.replace('2022/0', '')

# Replace each raw timestamp with its short form produced by formatTime.
ls_formattedTime = [formatTime(stamp) for stamp in df_view['Time'].tolist()]
df_view['Time'] = ls_formattedTime

# df_view_June is df_view without the rows whose short date is '7/01'.
df_view_June = df_view.drop(
    index=df_view.loc[df_view['Time'] == '7/01'].index.tolist()
)

In [5]:
# Use seaborn's "darkgrid" style for the figures drawn after this cell.
sns.set_style(style="darkgrid")

In [54]:
# sns.relplot(data=df_view_June, x="Time", y="WC", kind="line").fig.set_size_inches(8,4)

# Temporal Changes of Linguistic Measures: Mann-Kendall Trend Test

In [3]:
import numpy as np
import pymannkendall as mk
import statistics as stat

In [4]:
# One subset of df_view_June per display category.
df_crime_June = df_view_June.loc[df_view_June['Category'].eq('Crime-related')]
df_gender_June = df_view_June.loc[df_view_June['Category'].eq('Gender-related')]
df_other_June = df_view_June.loc[df_view_June['Category'].eq('Irrelevant or Ineffective')]

# Columns from position 8 onward are the ones tested for trends.
# NOTE(review): the offset 8 presumably skips the leading metadata columns -
# confirm against the CSV layout before the column order changes.
ls_categories = list(df_view.columns[8:])

In [5]:
# Full June series + full data
#
# Mann-Kendall trend test on the per-day median of every column listed in
# ls_categories, using all rows of df_view_June. Only columns whose test
# reports a trend at alpha=0.05 are printed, one per line:
#   column, trend direction, S statistic, significance stars, slope
# Stars: '***' p < 0.001, '**' p < 0.01, '*' for the remaining reported rows.

for cat in ls_categories:
    # groupby sorts the 'Time' keys; that sorted order is the order of the
    # series handed to the test.
    # NOTE(review): the keys are strings, so the order is chronological only
    # while the day part stays zero-padded (as in '7/01') - confirm.
    daily_median = df_view_June.groupby('Time')[cat].median()
    series = daily_median.tolist()

    res = mk.original_test(series, alpha=0.05)
    if not res.h:
        continue

    if res.p < 0.001:
        p = '***'
    elif res.p < 0.01:
        p = '**'
    else:
        # Explicit fallback: a row that passed the test always gets its own
        # label instead of reusing the one left over from an earlier column.
        p = '*'
    print("{cat}, {trend}, {s}, {p}, {slope:.2f}".format(
        cat=cat, trend=res.trend, s=res.s, p=p, slope=res.slope))

WPS, decreasing, -75.0, *, -0.17
pronoun, decreasing, -86.0, *, -0.10
prep, decreasing, -81.0, *, -0.13
quanunit, decreasing, -63.0, *, -0.04
general_pa, decreasing, -82.0, *, -0.07
affect, decreasing, -73.0, *, -0.12
social, decreasing, -84.0, *, -0.12
drives, increasing, 67.0, *, 0.06
relativ, increasing, 73.0, *, 0.09
space, decreasing, -80.0, *, -0.11
time, increasing, 155.0, ***, 0.15
informal, decreasing, -86.0, **, -0.09
assent, decreasing, -82.0, *, -0.08


In [6]:
# Full June series + crime data
#
# Mann-Kendall trend test on the per-day median of every column listed in
# ls_categories, restricted to the rows in df_crime_June. Only columns whose
# test reports a trend at alpha=0.05 are printed, one per line:
#   column, trend direction, S statistic, significance stars, slope
# Stars: '***' p < 0.001, '**' p < 0.01, '*' for the remaining reported rows.

for cat in ls_categories:
    # groupby sorts the 'Time' keys; that sorted order is the order of the
    # series handed to the test.
    # NOTE(review): the keys are strings, so the order is chronological only
    # while the day part stays zero-padded (as in '7/01') - confirm.
    daily_median = df_crime_June.groupby('Time')[cat].median()
    series = daily_median.tolist()

    res = mk.original_test(series, alpha=0.05)
    if not res.h:
        continue

    if res.p < 0.001:
        p = '***'
    elif res.p < 0.01:
        p = '**'
    else:
        # Explicit fallback: a row that passed the test always gets its own
        # label instead of reusing the one left over from an earlier column.
        p = '*'
    print("{cat}, {trend}, {s}, {p}, {slope:.2f}".format(
        cat=cat, trend=res.trend, s=res.s, p=p, slope=res.slope))

ipron, decreasing, -111.0, ***, -0.03
auxverb, decreasing, -96.0, **, -0.03
negate, increasing, 87.0, **, 0.06
particle, decreasing, -77.0, *, -0.11
compare, increasing, 70.0, *, 0.03
number, increasing, 74.0, *, 0.05
social, decreasing, -79.0, *, -0.06
cause, increasing, 112.0, ***, 0.05
tentat, decreasing, -101.0, **, -0.06
differ, increasing, 69.0, *, 0.05
achieve, increasing, 106.0, **, 0.07
risk, increasing, 91.0, **, 0.04
relativ, decreasing, -85.0, *, -0.10
motion, increasing, 83.0, *, 0.02
space, decreasing, -76.0, *, -0.06


In [7]:
# Full June series + gender data
#
# Mann-Kendall trend test on the per-day median of every column listed in
# ls_categories, restricted to the rows in df_gender_June. Only columns whose
# test reports a trend at alpha=0.05 are printed, one per line:
#   column, trend direction, S statistic, significance stars, slope
# Stars: '***' p < 0.001, '**' p < 0.01, '*' for the remaining reported rows.

for cat in ls_categories:
    # groupby sorts the 'Time' keys; that sorted order is the order of the
    # series handed to the test.
    # NOTE(review): the keys are strings, so the order is chronological only
    # while the day part stays zero-padded (as in '7/01') - confirm.
    daily_median = df_gender_June.groupby('Time')[cat].median()
    series = daily_median.tolist()

    res = mk.original_test(series, alpha=0.05)
    if not res.h:
        continue

    if res.p < 0.001:
        p = '***'
    elif res.p < 0.01:
        p = '**'
    else:
        # Explicit fallback: a row that passed the test always gets its own
        # label instead of reusing the one left over from an earlier column.
        p = '*'
    print("{cat}, {trend}, {s}, {p}, {slope:.2f}".format(
        cat=cat, trend=res.trend, s=res.s, p=p, slope=res.slope))

ppron, decreasing, -76.0, *, -0.07
ipron, increasing, 67.0, *, 0.04
tensem, increasing, 91.0, **, 0.08
modal_pa, increasing, 84.0, *, 0.05
differ, decreasing, -66.0, *, -0.07
bio, decreasing, -97.0, **, -0.07
affiliation, decreasing, -72.0, *, -0.05
time, increasing, 132.0, ***, 0.06


In [64]:
# Full June series + other data
#
# Mann-Kendall trend test on the per-day median of every column listed in
# ls_categories, restricted to the rows in df_other_June. This cell runs the
# test at alpha=0.01, so only columns with a trend at that level are printed:
#   column, trend direction, S statistic, significance stars, slope
# Stars follow the same scale as the other Mann-Kendall cells in this
# notebook: '***' p < 0.001, '**' p < 0.01, '*' otherwise. With alpha=0.01
# every printed row is therefore '**' or '***'.
# NOTE(review): the stricter alpha is used only for this subset - confirm
# that this is intended.

for cat in ls_categories:
    # groupby sorts the 'Time' keys; that sorted order is the order of the
    # series handed to the test.
    # NOTE(review): the keys are strings, so the order is chronological only
    # while the day part stays zero-padded (as in '7/01') - confirm.
    daily_median = df_other_June.groupby('Time')[cat].median()
    series = daily_median.tolist()

    res = mk.original_test(series, alpha=0.01)
    if not res.h:
        continue

    if res.p < 0.001:
        p = '***'
    elif res.p < 0.01:
        p = '**'
    else:
        # Explicit fallback: a row that passed the test always gets its own
        # label instead of reusing the one left over from an earlier column.
        p = '*'
    print("{cat}, {trend}, {s}, {p}, {slope:.2f}".format(
        cat=cat, trend=res.trend, s=res.s, p=p, slope=res.slope))

*** relativ: increasing, True, p=0.0004261374183640587, s=116.0, slope=0.1826076555023923
*** time: increasing, True, p=3.659233779984561e-05, s=127.0, slope=0.22916666666666669
*** informal: decreasing, True, p=0.008330234815334014, s=-88.0, slope=-0.08335664335664339
*** assent: decreasing, True, p=0.0070931131814231385, s=-90.0, slope=-0.13225274725274724
