# Social variables in the use of requests

In [None]:
import pandas as pd
import altair as alt

In [None]:
# Read the sentence-level request-classification corpus and keep only the
# rows whose `training` column equals 1.
utterances = pd.read_csv(
    '../in/stat-report/corpus_sent_request_classification_config05.csv'
).query('training == 1')

# Number of utterances kept after filtering.
len(utterances)

49997

Replace the placeholder values for unknown age, education and social grade with missing values (`pd.NA`).

In [None]:
# Placeholder string used for "unknown" in each social-variable column.
unknown_markers = {
    'age': 'Unknown',
    'education': '9_unknown',
    'socGrade': 'unknown',
}

# Swap each placeholder for pd.NA so the rows count as missing data.
utterances = utterances.assign(**{
    column: utterances[column].replace(marker, pd.NA)
    for column, marker in unknown_markers.items()
})

# age

In [None]:
# Column of `utterances` analysed by the cells of this section.
variable = 'age'

In [None]:
def get_total_counts(utterances: pd.DataFrame, variable: str) -> pd.DataFrame:
    """Tally how many rows of ``utterances`` fall into each value of ``variable``.

    Args:
        utterances: Table with one row per utterance.
        variable: Name of the column whose values are counted.

    Returns:
        A frame with the ``variable`` column and its frequency column,
        ordered by the values of ``variable``. Rows where ``variable`` is
        missing are left out, which is the ``value_counts`` default.
    """
    frequencies = utterances.value_counts(variable)
    table = frequencies.reset_index()
    return table.sort_values(variable)

In [None]:
# Utterance totals per age group; displayed as the cell output.
counts_all = get_total_counts(utterances, variable)
counts_all

Unnamed: 0,age,count
7,0_10,556
5,11_18,2359
0,19_29,21622
3,30_39,5929
1,40_49,8519
2,50_59,6452
4,60_69,2721
6,70_79,1123
8,80_89,67


In [None]:
def get_var_counts_requests(utterances: pd.DataFrame, variable: str) -> pd.DataFrame:
    """Tally the rows predicted as requests for each value of ``variable``.

    Only rows whose ``labels_preds`` column equals 1 are counted.

    Args:
        utterances: Table with one row per utterance and a ``labels_preds``
            column.
        variable: Name of the column whose values are counted.

    Returns:
        A frame with the ``variable`` column and its frequency column,
        ordered by the values of ``variable``.
    """
    predicted_requests = utterances.query('labels_preds == 1')
    frequencies = predicted_requests.value_counts(variable)
    return frequencies.reset_index().sort_values(variable)

In [None]:
# Utterances with labels_preds == 1 per age group; displayed as the cell output.
counts_requests = get_var_counts_requests(utterances, variable)
counts_requests

Unnamed: 0,age,count
7,0_10,8
5,11_18,22
0,19_29,165
2,30_39,50
3,40_49,41
1,50_59,53
4,60_69,26
6,70_79,11
8,80_89,1


In [None]:
def merge_counts(df1: pd.DataFrame, df2: pd.DataFrame, variable: str) -> pd.DataFrame:
    """Join total and request counts per category and compute the request rate.

    Args:
        df1: Counts over all utterances, with the columns ``variable`` and
            ``count``.
        df2: Counts over requests only, with the same two columns.
        variable: Name of the category column the two frames are joined on.

    Returns:
        One row per category with the columns ``variable``, ``count_all``,
        ``count_requests`` and ``requests_rate``. The rate is
        ``count_requests / count_all * 100``, i.e. a percentage.
    """
    merged = pd.merge(df1, df2, on=variable, how='outer', suffixes=('_all', '_requests'))

    # A category that never occurs in df2 had zero requests. The outer merge
    # leaves NaN there, which turns the counts into floats and the rate into
    # NaN. Record an explicit 0 and restore the original count dtype.
    request_counts = merged['count_requests'].fillna(0).astype(df2['count'].dtype)

    return merged.assign(
        count_requests=request_counts,
        requests_rate=request_counts / merged['count_all'] * 100,
    )

In [None]:
# Combine totals and request counts per age group and add the request rate (%).
counts_merged = merge_counts(counts_all, counts_requests, variable)
counts_merged

Unnamed: 0,age,count_all,count_requests,requests_rate
0,0_10,556,8,1.438849
1,11_18,2359,22,0.932599
2,19_29,21622,165,0.763112
3,30_39,5929,50,0.843313
4,40_49,8519,41,0.481277
5,50_59,6452,53,0.821451
6,60_69,2721,26,0.955531
7,70_79,1123,11,0.979519
8,80_89,67,1,1.492537


In [None]:
# Bar chart: one bar per category of `variable`, bar height is the request
# rate in percent.
chart = (
    alt.Chart(counts_merged)
    .mark_bar()
    .encode(x=alt.X(variable), y=alt.Y('requests_rate'))
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# chart.save(f'../out/request-rate_{variable}.png', scale_factor=3.0)

# gender

In [None]:
# Column of `utterances` analysed by the cells of this section.
variable = 'gender'

In [None]:
# Utterance totals per gender category; displayed as the cell output.
counts_all = get_total_counts(utterances, variable)
counts_all

Unnamed: 0,gender,count
0,F,29731
1,M,20255
2,X,11


In [None]:
# Utterances with labels_preds == 1 per gender category; displayed as the cell output.
counts_requests = get_var_counts_requests(utterances, variable)
counts_requests

Unnamed: 0,gender,count
0,F,220
1,M,163


In [None]:
# Combine totals and request counts per gender category and add the request rate (%).
counts_merged = merge_counts(counts_all, counts_requests, variable)
counts_merged

Unnamed: 0,gender,count_all,count_requests,requests_rate
0,F,29731,220.0,0.739968
1,M,20255,163.0,0.80474
2,X,11,,


In [None]:
# Bar chart: one bar per category of `variable`, bar height is the request
# rate in percent.
chart = (
    alt.Chart(counts_merged)
    .mark_bar()
    .encode(x=alt.X(variable), y=alt.Y('requests_rate'))
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# chart.save(f'../out/request-rate_{variable}.png', scale_factor=3.0)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


# education

In [None]:
# Column of `utterances` analysed by the cells of this section.
variable = 'education'

In [None]:
# Utterance totals per education level; displayed as the cell output.
counts_all = get_total_counts(utterances, variable)
counts_all

Unnamed: 0,education,count
4,1_primary,59
3,2_secondary,4736
2,3_sixthform,10501
0,4_graduate,21165
1,5_postgrad,11797


In [None]:
# Utterances with labels_preds == 1 per education level; displayed as the cell output.
counts_requests = get_var_counts_requests(utterances, variable)
counts_requests

Unnamed: 0,education,count
4,1_primary,1
3,2_secondary,48
1,3_sixthform,103
0,4_graduate,152
2,5_postgrad,60


In [None]:
# Combine totals and request counts per education level and add the request rate (%).
counts_merged = merge_counts(counts_all, counts_requests, variable)
counts_merged

Unnamed: 0,education,count_all,count_requests,requests_rate
0,1_primary,59,1,1.694915
1,2_secondary,4736,48,1.013514
2,3_sixthform,10501,103,0.980859
3,4_graduate,21165,152,0.718167
4,5_postgrad,11797,60,0.508604


In [None]:
# Bar chart: one bar per category of `variable`, bar height is the request
# rate in percent.
chart = (
    alt.Chart(counts_merged)
    .mark_bar()
    .encode(x=alt.X(variable), y=alt.Y('requests_rate'))
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# chart.save(f'../out/request-rate_{variable}.png', scale_factor=3.0)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


# social grade

In [None]:
# Column of `utterances` analysed by the cells of this section.
variable = 'socGrade'

In [None]:
# Utterance totals per social grade; displayed as the cell output.
counts_all = get_total_counts(utterances, variable)
counts_all

Unnamed: 0,socGrade,count
3,A,6697
1,B,13253
2,C1,8888
4,D,3071
0,E,15915


In [None]:
# Utterances with labels_preds == 1 per social grade; displayed as the cell output.
counts_requests = get_var_counts_requests(utterances, variable)
counts_requests

Unnamed: 0,socGrade,count
3,A,32
1,B,104
2,C1,60
4,D,23
0,E,150


In [None]:
# Combine totals and request counts per social grade and add the request rate (%).
counts_merged = merge_counts(counts_all, counts_requests, variable)
counts_merged

Unnamed: 0,socGrade,count_all,count_requests,requests_rate
0,A,6697,32,0.477826
1,B,13253,104,0.784728
2,C1,8888,60,0.675068
3,D,3071,23,0.748942
4,E,15915,150,0.942507


In [None]:
# Bar chart: one bar per category of `variable`, bar height is the request
# rate in percent.
chart = (
    alt.Chart(counts_merged)
    .mark_bar()
    .encode(x=alt.X(variable), y=alt.Y('requests_rate'))
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# chart.save(f'../out/request-rate_{variable}.png', scale_factor=3.0)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
