In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize


In [None]:
# Load the RSO allocation records; the first CSV column becomes the row index.
# The 'Type' column is kept: later cells group by it and one-hot encode it, so
# dropping it here makes those cells fail with a KeyError on a fresh kernel.
# NOTE(review): this absolute path only resolves on the original author's machine;
# consider pointing it at a location relative to the project instead.
rso_data = pd.read_csv('C:\\Users\\scher\\school\\senior-thesis\\data\\rso_data.txt', index_col=0)

# General Visualizations

In [None]:
# Per-year aggregates: sums of the numeric columns, and non-null counts per column.
# numeric_only=True keeps string columns (e.g. 'Organization') out of the sum, so the
# result does not depend on how the installed pandas version treats non-numeric data.
all_allocs = rso_data.groupby('Year').sum(numeric_only=True).reset_index()
all_rsos = rso_data.groupby('Year').count().reset_index()

# Both frames are grouped on the same key, so their rows line up year for year.
holistic = pd.DataFrame({
    'Year': all_allocs['Year'],
    'Organization': all_rsos['Organization'],
    'Allocation': all_allocs['Allocation'],
}).astype({'Year': 'int32', 'Organization': 'int32'})

# Mean allocation per counted organization, rounded to two decimals.
holistic['Avg per club'] = (holistic['Allocation'] / holistic['Organization']).round(2)

fig, ax = plt.subplots()
ax.set_title('Total Allocations to RSOs by Year')
sns.lineplot(x='Year', y='Allocation', data=holistic, ax=ax)

In [None]:
# Organizations whose allocations are totalled separately, year by year, below.
bridges = [
    'bridges Multicultural Resource Center',
    'Indigenous and Native Coalition (INC) Recruitment and Retention Center',
    'Middle Eastern North African Recruitment and Retention Center',
    'Mixed Student Union',
    'Pilipinx Academic Student Services',
    'Raíces Recruitment and Retention Center',
    'REACH! Asian Pacific American Recruitment and Retention Center',
]

bridges_data = rso_data[rso_data['Organization'].isin(bridges)]

# Pick 'Allocation' before summing so string columns are never aggregated.
bridges_hol = bridges_data.groupby('Year', as_index=False)['Allocation'].sum()

bridges_hol

In [None]:
# Number of rows recorded for every (Year, Type) combination.
counts = (
    rso_data
    .groupby(['Year', 'Type'])
    .size()
    .reset_index(name='Count')
)

plt.title('Number of RSOs by Type')
sns.lineplot(data=counts, x='Year', y='Count', hue='Type')

In [None]:
# Yearly totals across all types (numeric columns only, so 'Type' is not summed).
total = counts.groupby('Year').sum(numeric_only=True)

# Share of each year's count held by each type. The divisor is looked up through the
# row's own 'Year' value, so every proportion stays attached to its row regardless of
# how `counts` happens to be ordered.
props = counts.rename(columns={'Count': 'Proportion'})
props['Proportion'] = counts['Count'] / counts['Year'].map(total['Count'])

fig, ax = plt.subplots()
ax.set_title('Proportion of RSOs by Type')
sns.barplot(x='Year', y='Proportion', hue='Type', data=props, ax=ax)

In [None]:
# Numeric totals per (Year, Type). numeric_only=True stops string columns from being
# concatenated or rejected by the sum; 'Standing' is dropped because its sum is not used.
allocs = (
    rso_data
    .groupby(['Year', 'Type'])
    .sum(numeric_only=True)
    .drop(columns=['Standing'])
    .reset_index()
)

fig, ax = plt.subplots()
ax.set_title('Allocations by RSO Type')
sns.lineplot(x='Year', y='Allocation', hue='Type', data=allocs, ax=ax)

In [None]:
# Total allocation per year across all types (numeric columns only).
alloc_totals = allocs.groupby('Year').sum(numeric_only=True)

# Share of each year's allocation that went to each type. The yearly total is looked up
# through the row's own 'Year' value instead of relying on the row order of `allocs`.
alloc_props = allocs.rename(columns={'Allocation': 'Proportion'})
alloc_props['Proportion'] = allocs['Allocation'] / allocs['Year'].map(alloc_totals['Allocation'])

fig, ax = plt.subplots()
ax.set_title('Proportion of Allocations by RSO Type')
sns.barplot(x='Year', y='Proportion', hue='Type', data=alloc_props, ax=ax)

In [None]:
# Number of rows recorded for every (Year, Designation) combination.
specific_counts = (
    rso_data
    .groupby(['Year', 'Designation'])
    .size()
    .reset_index(name='Count')
)

# Oversized canvas for this plot (30 x 30 inches).
plt.figure(figsize=(30, 30))
plt.title('Number of RSOs by Designation')
sns.lineplot(data=specific_counts, x='Year', y='Count', hue='Designation')

In [None]:
# Numeric totals per (Year, Designation). numeric_only=True keeps string columns out of
# the sum; 'Standing' is dropped because its sum is not used.
specific_allocs = (
    rso_data
    .groupby(['Year', 'Designation'])
    .sum(numeric_only=True)
    .drop(columns=['Standing'])
    .reset_index()
)

# Oversized canvas for this plot (30 x 30 inches).
fig, ax = plt.subplots(figsize=(30, 30))
ax.set_title('Allocations for RSOs by Designation')
plot = sns.lineplot(x='Year', y='Allocation', hue='Designation', data=specific_allocs, ax=ax)

# Clustering

In [None]:
# Take an independent copy of the 2018 rows so that columns can be added to it later
# without pandas warning about writing into a slice of rso_data.
df_2018 = rso_data[rso_data['Year'] == 2018].copy()
df_dummies = pd.get_dummies(df_2018, columns=['Type', 'Designation'])

# Cluster on the two numeric features only. X is built as a fresh frame rather than
# assigned into a slice of df_dummies, which avoids SettingWithCopyWarning.
# NOTE(review): sklearn's normalize() rescales each *row* to unit L2 norm by default;
# if per-feature scaling was intended, a scaler such as StandardScaler is needed — confirm.
features = ['Standing', 'Allocation']
X = pd.DataFrame(normalize(df_dummies[features]), columns=features, index=df_dummies.index)

# Elbow method: record the inertia (within-cluster sum of squares) for k = 2..10.
k_vals = list(range(2, 11))
inertias = []

for k in k_vals:
    # A fixed seed and an explicit n_init keep the curve reproducible across runs
    # and across scikit-learn versions with different n_init defaults.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(k_vals, inertias, marker='o')
ax.set(title='K-Means Elbow Plot (2018)', xlabel='Number of clusters (k)', ylabel='Inertia');

In [None]:
# Final model. k = 5 is hard-coded — presumably read off the elbow plot; confirm.
# A fixed seed and an explicit n_init make the cluster labels reproducible between runs.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)

# assign() returns a new frame, so this does not write into a slice of rso_data
# (no SettingWithCopyWarning) and re-running the cell gives the same result.
df_2018 = df_2018.assign(label=kmeans.labels_)

fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title('K-Means Clusters of 2018 RSOs')
sns.scatterplot(data=df_2018, x='Standing', y='Allocation', hue='label', ax=ax)

# Misc

In [None]:
# Average 2018 allocation per designation: total allocation divided by the number of
# non-null 'Organization' entries. Columns are selected before aggregating so string
# columns are never summed.
rso_18 = rso_data[rso_data['Year'] == 2018]
by_designation = rso_18.groupby('Designation')

allocs_18 = by_designation[['Allocation']].sum()
clubs_18 = by_designation[['Organization']].count()

allocs_18['Allocation'] / clubs_18['Organization']

In [None]:
# For each Designation, find the index label of the 2019 row with the largest
# Allocation, then pull those rows out of the full table.
rso_2019 = rso_data[rso_data['Year'] == 2019]
top_idx = rso_2019.groupby('Designation')['Allocation'].idxmax()
rso_data.loc[top_idx]

In [None]:
# 2019 rows designated 'Political & Advocacy RSO', largest allocation first.
is_2019 = rso_data['Year'] == 2019
is_political = rso_data['Designation'] == 'Political & Advocacy RSO'
rso_data.loc[is_2019 & is_political].sort_values(by='Allocation', ascending=False)

In [None]:
# 2017 rows ranked by allocation, largest first. This is not the cell's final
# expression, so its result is computed but not kept.
rso_data.loc[rso_data['Year'] == 2017].sort_values(by='Allocation', ascending=False)

# Observation: cultural RSOs appear to receive the largest allocations.
# All rows recorded for one such organization:
rso_data.loc[rso_data['Organization'] == 'Korean American Student Association']