# STA130 Course Project - Analysis 2 Project Proposal
***
#### **Author**: David Daniliuc<br>**Created**: Sat Nov. 2, 2024

*Jupyter Python notebook for testing and experimenting with Analysis 2 of the STA130 Course Project.*

In [162]:

# Import statements
import pandas as pd
import numpy as np
import plotly.express as px
import scipy.stats as stats

# Read and import dataset
cols = pd.read_csv("var_names.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is what triggers the "mixed types"
# DtypeWarning on this dataset.
data = pd.read_csv(
    "CSCS_data_anon.csv",
    na_values=["9999", "", " ", "Presented but no response", "NA"],
    low_memory=False,
)

# Cleaning the data
# `empty` flags columns in which every value is missing; keep the rest.
empty = data.isna().all()
data = data.loc[:, ~empty]

# Keep only the cases not flagged for removal; copy so later edits act on an
# independent frame rather than a slice of the raw data.
data = data[data.REMOVE_case == 'No'].copy()


Columns (129,408,630,671,689,978,1001,1002,1006,1007,1008,1080,1113,1115,1116,1117,1118,1119,1120,1121,1124,1125,1126,1127,1128,1213,1214,1215,1216,1217,1218,1263,1266,1342,1343,1344,1345,1346,1347,1348,1349,1390,1391,1393,1439,1442,1463,1546,1549,1552,1555,1558,1561) have mixed types. Specify dtype option on import or set low_memory=False.



In [163]:
# Restrict to the 2023 cross-sectional rows (copied so `data` stays untouched)
data_cross_2023 = data[data.DATASET == '2023 Cross-Sectional'].copy()

print(data_cross_2023.shape)

# Remove rows with missing values in either variable used by the analysis
analysis_vars = ['SUBSTANCE_USE_drugs_cannabis', 'CONNECTION_social_time_friends_p7d']
data_cross_2023 = data_cross_2023.dropna(subset=analysis_vars)

# Keep only columns with fewer than missing_limit missing values.
# NOTE(review): a column can never have more missing values than there are
# rows, so if fewer than missing_limit rows survive the dropna above this
# filter cannot remove anything -- confirm the intended limit.
missing_limit = 1000
missing_per_column = data_cross_2023.isna().sum()
keptColumns = missing_per_column.index[missing_per_column < missing_limit]
data_cross_2023 = data_cross_2023[keptColumns].copy()

print(data_cross_2023.shape)

# Calculate Z-scores of time spent with friends
z_scores = stats.zscore(data_cross_2023['CONNECTION_social_time_friends_p7d'])
threshold = 3

# Filter out outliers: keep rows strictly within +/- threshold standard deviations.
# .copy() matches the other filtering steps and makes the result an independent
# frame, so later column assignments do not act on a slice of the old one.
within_threshold = abs(z_scores) < threshold
data_cross_2023 = data_cross_2023[within_threshold].copy()

print(data_cross_2023.shape)

(2936, 1779)
(308, 1779)
(301, 1779)


In [164]:
import pandas as pd
import plotly.express as px
import scipy.stats as stats

# Cannabis-use response labels, in the order they should appear on the axis
cannabis_order = [
    'Not in the past six months',
    'Less than monthly',
    'Monthly',
    'A few times a month',
    'Weekly',
    'A few times a week',
    'Daily or almost daily',
]

# Column names shared by both plot layers
friends_col = 'CONNECTION_social_time_friends_p7d'
cannabis_col = 'SUBSTANCE_USE_drugs_cannabis'

# Horizontal box plot: time with friends on x, one box per cannabis-use level on y
fig = px.box(
    data_cross_2023,
    x=friends_col,
    y=cannabis_col,
    title='Box Plot of Cannabis Use by Time with Friends',
    labels={
        friends_col: 'Time with Friends (7 Days)',
        cannabis_col: 'Cannabis Use',
    },
    category_orders={cannabis_col: cannabis_order},
)

# Overlay the individual observations: build a strip plot on the same axes
# and borrow only its first trace.
points_fig = px.strip(
    data_cross_2023,
    x=friends_col,
    y=cannabis_col,
    stripmode='overlay',
)
fig.add_trace(points_fig.data[0])

# Show the figure
fig.show()

In [165]:
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import statsmodels.api as sm

# Define the ordered categories for cannabis use (least to most frequent)
cannabis_order = ['Not in the past six months', 
                  'Less than monthly', 
                  'Monthly', 
                  'A few times a month', 
                  'Weekly', 
                  'A few times a week', 
                  'Daily or almost daily']

# Convert cannabis use to an ordered categorical variable so its integer
# codes follow cannabis_order
data_cross_2023['SUBSTANCE_USE_drugs_cannabis'] = pd.Categorical(
    data_cross_2023['SUBSTANCE_USE_drugs_cannabis'],
    categories=cannabis_order,
    ordered=True
)

# Any response not listed in cannabis_order becomes NaN with code -1.
# Left in, Spearman would silently rank those rows below the lowest real
# category, so exclude them and report how many were dropped.
cannabis_codes = data_cross_2023['SUBSTANCE_USE_drugs_cannabis'].cat.codes
has_known_level = cannabis_codes >= 0
n_unmatched = int((~has_known_level).sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} rows have a cannabis response not in cannabis_order; "
          "excluded from the Spearman correlation")

# Calculate the Spearman correlation between cannabis-use rank and time with friends
correlation, p_value = stats.spearmanr(
    cannabis_codes[has_known_level],
    data_cross_2023.loc[has_known_level, 'CONNECTION_social_time_friends_p7d']
)

print(f"Spearman correlation: {correlation}, p-value: {p_value}")


# Chi-Squared Test
# NOTE(review): the crosstab has one column per distinct value of the
# time-with-friends variable, so many cells may have very small expected
# counts -- consider binning that variable before relying on this p-value.
contingency_table = pd.crosstab(data_cross_2023['SUBSTANCE_USE_drugs_cannabis'], 
                                 data_cross_2023['CONNECTION_social_time_friends_p7d'])
chi2_stat, chi2_p, _, _ = stats.chi2_contingency(contingency_table)

print(f"Chi-Squared Test: chi2_stat = {chi2_stat}, p-value = {chi2_p}")


Spearman correlation: -0.04141044316098806, p-value: 0.4741382359798334
Chi-Squared Test: chi2_stat = 104.85514565688923, p-value = 0.5677313987750348
