In [10]:
import pandas as pd
import plotly.express as px

# Reloading the dataset
file_path = 'https://raw.githubusercontent.com/pointOfive/stat130chat130/refs/heads/main/CP/CSCS_data_anon.csv'
data = pd.read_csv(file_path, low_memory=True)

# Drop rows with missing values in the required co
data = data.dropna(subset=['WORK_company_size', 'CONNECTION_reached_out_coworkers'])
# Define category orders
company_size_order = ['Small (1-49 employees)', 'Medium (50-249 employees)', 'Large (250+ employees)']
likelihood_order = ['Very Unlikely', 'Unlikely', 'Somewhat Unlikely', 'Somewhat Likely', 'Likely', 'Very Likely']

# Convert columns to ordered categorical data
data['WORK_company_size'] = pd.Categorical(data['WORK_company_size'], categories=company_size_order, ordered=True)
data['CONNECTION_reached_out_coworkers'] = pd.Categorical(data['CONNECTION_reached_out_coworkers'], categories=likelihood_order, ordered=True)

# Group data for visualization
grouped_data = data.groupby(['WORK_company_size', 'CONNECTION_reached_out_coworkers']).size().reset_index(name='Count')

# Plot a stacked bar chart using plotly
fig = px.bar(
    grouped_data,
    x='WORK_company_size',
    y='Count',
    color='CONNECTION_reached_out_coworkers',
    text='Count',
    title='Likelihood of Reaching Out to Coworkers by Company Size',
    labels={'WORK_company_size': 'Company Size', 'Count': 'Count', 'CONNECTION_reached_out_coworkers': 'Likelihood'},
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Vivid
)


# Display the chart
fig.show()



Columns (408,1001,1002,1006,1007,1008,1080,1113,1115,1116,1117,1118,1119,1120,1121,1124,1125,1126,1127,1128,1213,1214,1215,1216,1217,1218,1342,1343,1344,1345,1346,1347,1348,1349,1390,1391,1393,1463,1549,1552,1555,1558,1561) have mixed types. Specify dtype option on import or set low_memory=False.





In [11]:
print(data[['WORK_company_size', 'CONNECTION_reached_out_coworkers']].head())


    WORK_company_size CONNECTION_reached_out_coworkers
59                NaN                              NaN
63                NaN                         Unlikely
132               NaN                  Somewhat Likely
159               NaN                              NaN
172               NaN                           Likely


In [12]:
print(data['WORK_company_size'].isna().mean() * 100)  # Percentage of missing values


100.0


In [13]:
likelihood_distribution = data['CONNECTION_reached_out_coworkers'].value_counts()
print(likelihood_distribution)



CONNECTION_reached_out_coworkers
Somewhat Likely      132
Likely                81
Unlikely              30
Very Unlikely         28
Somewhat Unlikely      0
Very Likely            0
Name: count, dtype: int64
