In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import types 
from pyspark.sql.functions import *
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg
from pyspark.ml.feature import VectorAssembler
#Kmeans 
from pyspark.ml.clustering import KMeans
# Clustering Evaluator
from pyspark.ml.evaluation import ClusteringEvaluator


# Build (or reuse) the Spark session: postgresql-42.2.18.jar is put on the
# driver classpath and the driver and executors are each given 8g of memory.
ss = (
    SparkSession.builder
    .config('spark.driver.extraClassPath', 'postgresql-42.2.18.jar')
    .config('spark.driver.memory', '8g')
    .config('spark.executor.memory', '8g')
    .getOrCreate()
)
sc = ss.sparkContext

In [2]:
# #df.write.csv('mycsv.csv')
# df_pca.write.parquet('/Users/christabelle/classes/Spring/DistrComp_694/FoxInsight (1)/df_pca')

In [3]:
# data = ss.read.parquet('/Users/christabelle/classes/Spring/DistrComp_694/FoxInsight (1)/df_pca/*')

In [4]:
# Read the parquet dataset stored at the relative path "df_pca".
# The clustering step reads its 'pca' column as the feature vector.
df_pca = ss.read.parquet("df_pca")

In [5]:
# Fit a 6-cluster k-means model on the 'pca' column (fixed seed), then
# label every row; transform() adds the 'prediction' column.
kmeans = KMeans(
    featuresCol='pca',
    k=6,
    maxIter=200,
    tol=0.01,
    seed=11,
)
model = kmeans.fit(df_pca)
df_kmeans = model.transform(df_pca)

In [6]:
# List the distinct cluster labels that were actually assigned (row order is not sorted)
df_kmeans.select('prediction').distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         3|
|         5|
|         4|
|         2|
|         0|
+----------+



In [7]:
# Column names of the clustered DataFrame
print(df_kmeans.columns)

['COVPDDiag', 'COVSympInfect', 'COVProDiag', 'COVTest', 'COVFever', 'COVCough', 'COVSympChill', 'COVSympSweat', 'COVSympShortBreath', 'COVSympChestTight', 'COVSympChestPain', 'COVSympTired', 'COVSympSleepy', 'COVSympSoreThroat', 'COVSympCongest', 'COVSympMuscleAche', 'COVSympJointPain', 'COVSympLoseSmell', 'COVSympLoseTase', 'COVSympLoseApetite', 'COVSympDiarrhea', 'COVSympStomachPain', 'COVSympNausea', 'COVSympVomit', 'COVSympLightHead', 'COVSelfIsolate', 'COVLivingSituation', 'COVRegion', 'COVSocialDistancing', 'COVAlterPhysTher', 'COVAlterSpeechTher', 'COVAlterOccTher', 'COVAlterMenHealth', 'COVAlterSpprtGrp', 'COVAlterExercise', 'COVAlterSeeFamily', 'COVAlterSeeFriend', 'COVAlterCommunAct', 'COVAlterReligAct', 'COVAlterVolntrAct', 'COVAlterClinVisit', 'COVAlterOthMedApt', 'COVSocDisWork', 'COVSocDisHouseFinance', 'COVRschAttdtChange', 'COVRschPartcpLikely', 'fox_insight_id', 'age', 'Sex', 'RaceW', 'RaceAA', 'RaceAI', 'RaceA', 'EthnNotHispanic', 'EthnMexican', 'EthnPuerto', 'EthnLat

In [8]:
# Centroid of every cluster, followed by the shape of that centroid array
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid, centroid.shape, sep="\n")

Cluster Centers: 
[ 2.41841156e+04  8.13479593e-01 -1.11799012e+01 -4.07345454e+00
  5.62150128e+00 -1.19350663e+01 -3.35388423e+00  2.98856784e+00
 -3.00671258e+00  7.42716609e-01 -2.70878370e+00 -2.14625984e-01
  4.07236654e-01  3.55162065e+00  5.27999631e+00]
(15,)
[ 4.07254176e+04  1.17105735e+00 -1.14340086e+01 -3.46792986e+00
  6.34456874e+00 -1.14061546e+01 -3.46082635e+00  1.96920517e+00
 -3.51747620e+00  6.61113304e-01 -3.07844759e+00 -5.14982503e-01
  4.26744103e-01  3.41357928e+00  5.27916576e+00]
(15,)
[ 2.90126428e+04  5.02173456e-01 -1.15281831e+01 -4.26883902e+00
  5.94289619e+00 -1.17486841e+01 -3.41079925e+00  2.19722321e+00
 -3.63461133e+00  7.12588241e-01 -2.29695647e+00 -6.63014403e-01
  2.18599096e-01  3.66333870e+00  5.18807352e+00]
(15,)
[ 1.52620164e+04  1.39059943e+00 -9.83911714e+00 -3.26270953e+00
  6.95304116e+00 -1.05187140e+01 -3.85508793e+00  2.12294335e+00
 -3.69787163e+00  1.10402750e+00 -2.79939714e+00 -7.77671211e-01
  6.34860451e-01  3.36280528e+00  

In [9]:
# Score the clustering with ClusteringEvaluator's default metric on the 'pca' features
evaluator = ClusteringEvaluator(featuresCol='pca')
silhouette = evaluator.evaluate(df_kmeans)
print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.7350327254965526


> "The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters." — Wikipedia, "Silhouette (clustering)"

In [10]:
# One DataFrame per cluster label, in label order 0..5
first, second, third, fourth, fifth, sixth = (
    df_kmeans.filter(df_kmeans['prediction'] == label) for label in range(6)
)

In [11]:
# Largest observed value of each mood column; these act as the per-row ceiling
# when group scores are converted to percentages.
# `max` is pyspark.sql.functions.max here (the star import shadows the builtin).
# A single aggregation returns all four maxima in one Row, replacing four
# separate jobs that each did a distinct() and an RDD round trip.
max_pos, max_neg, max_lon, max_ene = df_kmeans.agg(
    max('positive'),
    max('negative'),
    max('lonely'),
    max('energy'),
).first()

In [12]:
# Cluster DataFrames in label order; list position i is reported as "Group i+1"
dfs = [
    first, second, third,
    fourth, fifth, sixth,
]
# Mood columns and their observed maxima, aligned index-for-index
moods = [
    'positive', 'negative', 'lonely', 'energy',
]
max_moods = [
    max_pos, max_neg, max_lon, max_ene,
]

def return_moods(dfs, columns, df_idx=None, mood_idx=None, across_groups=True):
    """Print mood scores as a percentage of the maximum attainable score.

    A score is ``sum(column) / (row_count * column_max) * 100`` rounded to
    three decimals. ``column_max`` is read from the module-level ``max_moods``
    list, which must be aligned index-for-index with ``columns``.

    Parameters
    ----------
    dfs : list of DataFrame
        One DataFrame per cluster; position ``i`` is printed as "Group i+1".
    columns : list of str
        Names of the mood columns to score.
    df_idx : int, optional
        Index into ``dfs``. Required when ``across_groups`` is False.
    mood_idx : int, optional
        Index into ``columns``. Required when ``across_groups`` is True.
    across_groups : bool
        True: show one mood for every group.
        False: show every mood for the single group ``dfs[df_idx]``.

    Returns
    -------
    None
        Results are printed / shown, nothing is returned.

    Raises
    ------
    TypeError
        If the index required by the selected mode was not supplied.
    """
    def show_score(df, column_name, column_max):
        # `round` and `sum` are the pyspark.sql.functions versions (the star
        # import shadows the builtins), so this builds a Column expression.
        n_rows = df.count()
        percentage = round(sum(df[column_name]) / (n_rows * column_max) * 100, 3)
        df.select(percentage.alias(f'{column_name} %')).show()

    if across_groups:
        if mood_idx is None:
            raise TypeError('return_moods() needs an integer mood_idx when across_groups=True')
        for idx, df in enumerate(dfs):
            print(f'Group {idx+1}')
            # Use the `columns` argument rather than the global `moods` list
            show_score(df, columns[mood_idx], max_moods[mood_idx])
    else:
        if df_idx is None:
            raise TypeError('return_moods() needs an integer df_idx when across_groups=False')
        for idx, column_name in enumerate(columns):
            print(f'Group {df_idx+1}')
            show_score(dfs[df_idx], column_name, max_moods[idx])

### Returning Mood Percentages Across Groups

In [13]:
# 'positive' percentage for every group
return_moods(dfs, moods, mood_idx=0)

Group 1
+----------+
|positive %|
+----------+
|      75.0|
+----------+

Group 2
+----------+
|positive %|
+----------+
|    85.417|
+----------+

Group 3
+----------+
|positive %|
+----------+
|    83.333|
+----------+

Group 4
+----------+
|positive %|
+----------+
|    66.071|
+----------+

Group 5
+----------+
|positive %|
+----------+
|    89.655|
+----------+

Group 6
+----------+
|positive %|
+----------+
|    78.922|
+----------+



In [14]:
# 'negative' percentage for every group
return_moods(dfs, moods, mood_idx=1)

Group 1
+----------+
|negative %|
+----------+
|    31.111|
+----------+

Group 2
+----------+
|negative %|
+----------+
|     26.25|
+----------+

Group 3
+----------+
|negative %|
+----------+
|      26.0|
+----------+

Group 4
+----------+
|negative %|
+----------+
|    33.571|
+----------+

Group 5
+----------+
|negative %|
+----------+
|    20.345|
+----------+

Group 6
+----------+
|negative %|
+----------+
|     25.49|
+----------+



In [15]:
# 'lonely' percentage for every group
return_moods(dfs, moods, mood_idx=2)

Group 1
+--------+
|lonely %|
+--------+
|  35.556|
+--------+

Group 2
+--------+
|lonely %|
+--------+
|  34.167|
+--------+

Group 3
+--------+
|lonely %|
+--------+
|  29.333|
+--------+

Group 4
+--------+
|lonely %|
+--------+
|    40.0|
+--------+

Group 5
+--------+
|lonely %|
+--------+
|  25.862|
+--------+

Group 6
+--------+
|lonely %|
+--------+
|  31.765|
+--------+



In [16]:
# 'energy' percentage for every group
return_moods(dfs, moods, mood_idx=3)

Group 1
+--------+
|energy %|
+--------+
|  36.111|
+--------+

Group 2
+--------+
|energy %|
+--------+
|    37.5|
+--------+

Group 3
+--------+
|energy %|
+--------+
|  33.333|
+--------+

Group 4
+--------+
|energy %|
+--------+
|  14.286|
+--------+

Group 5
+--------+
|energy %|
+--------+
|  22.414|
+--------+

Group 6
+--------+
|energy %|
+--------+
|   25.49|
+--------+



### Returning Percentages for Each Group

In [17]:
# All four mood percentages for Group 1
return_moods(dfs, moods, df_idx=0, across_groups=False)

Group 1
+----------+
|positive %|
+----------+
|      75.0|
+----------+

Group 1
+----------+
|negative %|
+----------+
|    31.111|
+----------+

Group 1
+--------+
|lonely %|
+--------+
|  35.556|
+--------+

Group 1
+--------+
|energy %|
+--------+
|  36.111|
+--------+



In [18]:
# All four mood percentages for Group 2
return_moods(dfs, moods, df_idx=1, across_groups=False)

Group 2
+----------+
|positive %|
+----------+
|    85.417|
+----------+

Group 2
+----------+
|negative %|
+----------+
|     26.25|
+----------+

Group 2
+--------+
|lonely %|
+--------+
|  34.167|
+--------+

Group 2
+--------+
|energy %|
+--------+
|    37.5|
+--------+



In [19]:
# All four mood percentages for Group 3
return_moods(dfs, moods, df_idx=2, across_groups=False)

Group 3
+----------+
|positive %|
+----------+
|    83.333|
+----------+

Group 3
+----------+
|negative %|
+----------+
|      26.0|
+----------+

Group 3
+--------+
|lonely %|
+--------+
|  29.333|
+--------+

Group 3
+--------+
|energy %|
+--------+
|  33.333|
+--------+



In [20]:
# Group 4: least positive, most negative, most lonely, least energy
return_moods(dfs, moods, df_idx=3, across_groups=False)

Group 4
+----------+
|positive %|
+----------+
|    66.071|
+----------+

Group 4
+----------+
|negative %|
+----------+
|    33.571|
+----------+

Group 4
+--------+
|lonely %|
+--------+
|    40.0|
+--------+

Group 4
+--------+
|energy %|
+--------+
|  14.286|
+--------+



In [21]:
# Group 5: most positive, least negative, least lonely, second-lowest energy
return_moods(dfs, moods, df_idx=4, across_groups=False)

Group 5
+----------+
|positive %|
+----------+
|    89.655|
+----------+

Group 5
+----------+
|negative %|
+----------+
|    20.345|
+----------+

Group 5
+--------+
|lonely %|
+--------+
|  25.862|
+--------+

Group 5
+--------+
|energy %|
+--------+
|  22.414|
+--------+



In [22]:
# Group 6: mid-range on all four moods. (The "least positive, most negative,
# most lonely, least energy" profile belongs to Group 4, i.e. df_idx=3.)
return_moods(dfs, moods, 5, None, False)

Group 6
+----------+
|positive %|
+----------+
|    78.922|
+----------+

Group 6
+----------+
|negative %|
+----------+
|     25.49|
+----------+

Group 6
+--------+
|lonely %|
+--------+
|  31.765|
+--------+

Group 6
+--------+
|energy %|
+--------+
|   25.49|
+--------+



In [88]:
## Ranking Groups

# Ascending order ranks (lowest to highest)
def return_mood_ranks(dfs, columns, df_idx=None, mood_idx=None):
    """Rank the groups by one mood score, lowest score first.

    The score of a group is ``sum(column) / (row_count * column_max) * 100``
    rounded to three decimals, with ``column_max`` taken from the module-level
    ``max_moods`` list (aligned index-for-index with ``columns``).

    Parameters
    ----------
    dfs : list of DataFrame
        One DataFrame per cluster; position ``i`` is labelled "Group i+1".
    columns : list of str
        Names of the mood columns.
    df_idx : optional
        Unused; kept so existing callers keep working.
    mood_idx : int
        Index into ``columns`` of the mood to rank by.

    Returns
    -------
    list of str
        Group labels in ascending score order. Tied groups keep their original
        relative order; a group whose score is null (for example an empty
        cluster) is placed before every scored group.

    Raises
    ------
    TypeError
        If ``mood_idx`` is not supplied.
    """
    if mood_idx is None:
        raise TypeError('return_mood_ranks() needs an integer mood_idx')

    scored = []
    for idx, df in enumerate(dfs):
        # Use the `columns` argument rather than the global `moods` list
        column_name = columns[mood_idx]
        n_rows = df.count()
        # `round` and `sum` are the pyspark.sql.functions versions (the star
        # import shadows the builtins). Reading the single value through
        # first() avoids converting the result to pandas for one scalar.
        score = df.select(
            round(sum(df[column_name]) / (n_rows * max_moods[mood_idx]) * 100, 3)
        ).first()[0]
        scored.append((f'Group {idx+1}', score))

    def sort_key(pair):
        score = pair[1]
        # Nulls sort first instead of raising on None-vs-float comparison
        return (score is not None, 0.0 if score is None else score)

    return [label for label, _ in sorted(scored, key=sort_key)]

def print_ranks(ranks):
    """Print every entry of ``ranks`` on its own line, numbered from 1."""
    for position, label in enumerate(ranks, start=1):
        print(f'{position}. {label}')

def ranks_total():
    """Print the group ranking for each of the four moods.

    Reads the module-level ``dfs`` and ``moods``. Positive and energy rankings
    are printed highest score first; negative and lonely rankings are printed
    lowest score first. All rankings are computed before anything is printed.
    """
    # (index into moods, header line, flip the ascending ranking?)
    sections = [
        (0, 'Positive ranks in descending order:', True),
        (1, 'Negative ranks in ascending order:', False),
        (2, 'Lonely ranks in ascending order', False),
        (3, 'Energy ranks in descending order', True),
    ]

    rankings = []
    for mood_position, header, flip in sections:
        ordered = return_mood_ranks(dfs, moods, df_idx=None, mood_idx=mood_position)
        if flip:
            ordered.reverse()
        rankings.append((header, ordered))

    for section_number, (header, ordered) in enumerate(rankings):
        if section_number:
            # Blank line between sections, none after the last one
            print()
        print(header)
        print_ranks(ordered)

# Print the four mood rankings of the six clusters
ranks_total()

Group 4 is ranked last (#6) on every mood: it has the lowest positive score, the highest negative and lonely scores, and the lowest energy score.

## Investigating Race

In [90]:
# Distinct values taken by the RaceW flag
df_kmeans.select('RaceW').distinct().collect()

[Row(RaceW=1), Row(RaceW=0)]

- RaceW - White or Caucasian
- RaceAA - Black or African American
- RaceAI - American Indian or Alaska Native
- RaceA - Asian

In [95]:
# Number of rows with RaceW == 1
df_kmeans.filter(df_kmeans['RaceW'] == 1).count()

206

In [93]:
# Total number of rows in the clustered DataFrame
df_kmeans.count()

213

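Not enough representation: 206 of the 213 respondents are White or Caucasian (RaceW = 1), so the other racial groups are too small to compare across clusters.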

## Education

In [142]:
# Education breakdown of Group 4 (prediction == 3): one row per distinct
# 'Education' value with its row count, ordered by 'Education' descending.
group_4_education = df_kmeans.filter(df_kmeans['prediction'] == 3).groupBy('Education').count().orderBy('Education', ascending=False).toPandas()

# Replace the raw values with readable labels. Labels are assigned by row
# position, so this only works if the query returns exactly these four levels
# in exactly this order; a different number of rows raises, a different set of
# levels mislabels silently.
# NOTE(review): presumably higher codes mean more education - confirm against
# the data dictionary; a code -> label mapping would be less fragile.
group_4_education['Education'] = ['Master\'s', 'Bachelor\'s', 'Some college (no degree)', 'High school diploma']

# NOTE(review): only the last expression of a notebook cell is displayed, so
# this bare name shows nothing if this listing is a single cell.
group_4_education

# Same breakdown for Group 5 (prediction == 4).
group_5_education = df_kmeans.filter(df_kmeans['prediction'] == 4).groupBy('Education').count().orderBy('Education', ascending=False).toPandas()

# Positional relabelling again: assumes exactly these seven levels, in this order.
group_5_education['Education'] = ['Doctorate', 'Professional (e.g. MD)', 'Master\'s', 'Bachelor\'s', 'Associate\'s', 'Some college (no degree)', 'High school diploma']

group_5_education

In [120]:
# Column names of the clustered DataFrame, to pick further variables to explore
df_kmeans.columns

['COVPDDiag',
 'COVSympInfect',
 'COVProDiag',
 'COVTest',
 'COVFever',
 'COVCough',
 'COVSympChill',
 'COVSympSweat',
 'COVSympShortBreath',
 'COVSympChestTight',
 'COVSympChestPain',
 'COVSympTired',
 'COVSympSleepy',
 'COVSympSoreThroat',
 'COVSympCongest',
 'COVSympMuscleAche',
 'COVSympJointPain',
 'COVSympLoseSmell',
 'COVSympLoseTase',
 'COVSympLoseApetite',
 'COVSympDiarrhea',
 'COVSympStomachPain',
 'COVSympNausea',
 'COVSympVomit',
 'COVSympLightHead',
 'COVSelfIsolate',
 'COVLivingSituation',
 'COVRegion',
 'COVSocialDistancing',
 'COVAlterPhysTher',
 'COVAlterSpeechTher',
 'COVAlterOccTher',
 'COVAlterMenHealth',
 'COVAlterSpprtGrp',
 'COVAlterExercise',
 'COVAlterSeeFamily',
 'COVAlterSeeFriend',
 'COVAlterCommunAct',
 'COVAlterReligAct',
 'COVAlterVolntrAct',
 'COVAlterClinVisit',
 'COVAlterOthMedApt',
 'COVSocDisWork',
 'COVSocDisHouseFinance',
 'COVRschAttdtChange',
 'COVRschPartcpLikely',
 'fox_insight_id',
 'age',
 'Sex',
 'RaceW',
 'RaceAA',
 'RaceAI',
 'RaceA',
 'Et

In [145]:
# Distinct response codes present in COVAlterExercise
df_kmeans.select('COVAlterExercise').distinct().show()

+----------------+
|COVAlterExercise|
+----------------+
|               1|
|               3|
|               4|
|               2|
+----------------+



In [146]:
# Row count per COVAlterExercise code across all clusters
df_kmeans.groupBy('COVAlterExercise').count().show()

+----------------+-----+
|COVAlterExercise|count|
+----------------+-----+
|               1|   42|
|               3|   66|
|               4|   84|
|               2|   21|
+----------------+-----+



In [152]:
# Row count per COVAlterExercise code within Group 5 (prediction == 4)
df_kmeans.filter(df_kmeans['prediction'] == 4).groupBy('COVAlterExercise').count().show()

+----------------+-----+
|COVAlterExercise|count|
+----------------+-----+
|               1|   11|
|               3|   20|
|               4|   21|
|               2|    6|
+----------------+-----+



In [25]:
# sc.stop()
# ss.stop()