# Visualize the Reduced Data (20 points)

#### imports

In [52]:
import ex1_functions as ex1
import plotly.express as px

#### Compare Cities:
* Aggregate the data so that each row represents a city, and each column represents the total number of votes obtained by each party.
* Remove the data for parties that received fewer than 1000 total votes.
* Create a scatter plot of the reduced data (for num_components = 2) to visualize dimensionality reduction.
* Estimate the number of clusters visually.


In [53]:
# Load the Knesset 25 results from the Excel file.
kneset25_df = ex1.load_data("knesset_25.xlsx")

# Aggregate so that each row represents a city and each column holds the
# total number of votes obtained by each party. The ballot_code column is
# dropped first so it does not take part in the "sum" aggregation.
agg_df = ex1.group_and_aggregate_data(
    kneset25_df.drop(columns=['ballot_code']),
    'city_name',
    "sum",
)
agg_df.head()

Unnamed: 0,city_name,party_avoda,party_shahar_kalkali_hadash,party_bayit_yehudi,party_agudat_israel,party_daled,party_vavmem,party_shahar_koach_hevrati,party_kama,party_koach_lehashpia,...,party_tze'irim_bo'arim,party_manhigut_hevratit,party_kol_hasviva_vehachai,party_halev_hayehudi,party_seder_chadash,party_kol,party_beometz_bishvilech,party_kavod_umasoret,party_shas,party_daat_tov_vera
0,אבו גווייעד שבט,1,0,0,0,4,38,0,0,1,...,1,0,0,0,0,0,0,0,4,3
1,אבו גוש,14,1,1,3,1263,312,0,0,0,...,2,7,1,0,1,1,3,0,4,0
2,אבו סנאן,34,0,3,0,677,2030,4,1,2,...,1,4,1,3,1,6,9,0,12,1
3,אבו עבדון שבט,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,אבו קורינאת שבט,5,0,1,0,10,65,0,0,1,...,0,1,0,0,2,1,0,0,3,0


In [54]:
# Remove the data for parties that received less than 1000 total votes
# (each party is one column of the city-level table).
min_party_votes = 1000
filtered_df = ex1.remove_sparse_columns(agg_df, min_party_votes)
filtered_df.head()

Unnamed: 0,city_name,party_avoda,party_shahar_kalkali_hadash,party_bayit_yehudi,party_agudat_israel,party_daled,party_vavmem,party_tet,party_israel_hofshit_demokratit,party_hakalkalit_hahadasha,...,party_raam,party_yesh_atid,party_pesofit,party_tze'irim_bo'arim,party_kol_hasviva_vehachai,party_seder_chadash,party_kol,party_beometz_bishvilech,party_shas,party_daat_tov_vera
0,אבו גווייעד שבט,1,0,0,0,4,38,0,0,0,...,468,1,0,1,0,0,0,0,4,3
1,אבו גוש,14,1,1,3,1263,312,13,0,0,...,838,26,0,2,1,1,1,3,4,0
2,אבו סנאן,34,0,3,0,677,2030,6,4,0,...,1160,163,0,1,1,1,6,9,12,1
3,אבו עבדון שבט,0,0,0,0,1,1,0,0,0,...,39,0,0,0,0,0,0,0,0,0
4,אבו קורינאת שבט,5,0,1,0,10,65,0,0,0,...,1096,4,1,0,0,2,1,0,3,0


In [57]:
# Reduce the filtered city-level table to 2 components. 'city_name' is passed
# as the third argument (presumably the metadata columns the helper keeps out
# of the reduction -- confirm in ex1_functions).
pca2_df = ex1.dimensionality_reduction(filtered_df, 2, ['city_name'])

# Scatter plot of the reduced data (num_components = 2) to visualize the
# dimensionality reduction; points are coloured by city name.
axis_titles = {"PC1": "Principal Component 1", "PC2": "Principal Component 2"}
hover_columns = {'city_name': True, 'PC1': True, 'PC2': True}

fig = px.scatter(
    pca2_df,
    x='PC1',
    y='PC2',
    color='city_name',
    title="KNESET25 PCA BY CITY",
    labels=axis_titles,
    hover_data=hover_columns,
    template="plotly_dark",
)

fig.show()


In [62]:
# Just for fun: cluster the 2-D PCA projection of the cities with k-means.
from sklearn.cluster import KMeans  # imported here so the cell runs on its own

# random_state pins the centroid initialisation so the cluster labels (and
# therefore the plot colours) are the same on every Restart & Run All.
# n_init is set explicitly because its default differs between scikit-learn
# versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # k=3
pca2_df['Cluster'] = kmeans.fit_predict(pca2_df[['PC1', 'PC2']])

# Cluster ids are nominal labels, not a numeric scale. They are plotted as
# strings so plotly draws a discrete legend instead of a continuous colour
# bar; the integer 'Cluster' column stored in pca2_df is left unchanged.
fig = px.scatter(pca2_df.assign(Cluster=pca2_df['Cluster'].astype(str)),
                 x='PC1',
                 y='PC2',
                 color='Cluster',  # Color by the cluster
                 hover_data={'city_name': True, 'PC1': True, 'PC2': True, 'Cluster': True},
                 title="PCA with Clusters",
                 labels={"PC1": "Principal Component 1", "PC2": "Principal Component 2"},
                 template="plotly_dark")

fig.show()

#### Compare Parties:
* Transpose the city-wide data so that each row represents a party, and each column represents a city.
* Remove data for cities with fewer than 1000 total votes.
* Create a scatter plot of the reduced data (for num_components = 2) to visualize dimensionality reduction.
* Estimate the number of clusters visually.


In [64]:
# Aggregate by city; ballot_code is removed before summing.
agg_df = ex1.group_and_aggregate_data(
    kneset25_df.drop(columns=['ballot_code']),
    'city_name',
    "sum",
)

# Transpose so that each row represents a party and each column a city, then
# turn the transposed index (the party labels) back into a regular column and
# name that column 'party_name'.
df_transposed = (
    agg_df
    .set_index('city_name')
    .T
    .reset_index()
    .rename(columns={'index': 'party_name'})
)

df_transposed.head()

city_name,party_name,אבו גווייעד שבט,אבו גוש,אבו סנאן,אבו עבדון שבט,אבו קורינאת שבט,אבו קרינאת יישוב,אבו רובייעה שבט,אבו רוקייק שבט,אבו תלול,...,תלמים,תמרת,תנובות,תעוז,תפרח,תקומה,תקוע,תראבין אצאנע שבט,תראבין אצאנעישוב,תרום
0,party_avoda,1,14,34,0,5,2,10,0,0,...,0,120,18,16,1,3,25,0,1,5
1,party_shahar_kalkali_hadash,0,1,0,0,0,0,0,0,0,...,3,3,1,3,4,2,18,0,0,4
2,party_bayit_yehudi,0,1,3,0,1,0,0,0,0,...,4,7,3,8,0,42,266,0,0,17
3,party_agudat_israel,0,3,0,0,0,1,5,0,0,...,7,1,4,3,670,1,13,0,0,12
4,party_daled,4,1263,677,1,10,6,24,35,6,...,0,3,0,1,0,0,0,0,6,0


In [65]:
# Remove data for cities with fewer than 1000 total votes.
# After the transpose every column is a city, so this call filters cities,
# not parties.
filtered_tdf = ex1.remove_sparse_columns(df_transposed,1000)

print(f"before filter, df shape was:{df_transposed.shape}, after filter, shape is:{filtered_tdf.shape}", end='\n\n')
print(f"{df_transposed.shape[1]-filtered_tdf.shape[1]} columns were removed:")

# Build the set of surviving columns once so each membership test is O(1)
# instead of scanning a freshly built list for every column.
kept_columns = set(filtered_tdf.columns)
for col in df_transposed.columns:
    if col not in kept_columns:
        # Name left-aligned in a 30-character field, followed by its vote total.
        print(f"\t{col:<30}\t\t {df_transposed[col].sum()}")

before filter, df shape was:(40, 1217), after filter, shape is:(40, 266)

951 columns were removed:
	אבו גווייעד שבט               		 540
	אבו עבדון שבט                 		 41
	אבו קרינאת יישוב              		 346
	אבו תלול                      		 647
	אבטין                         		 867
	אבטליון                       		 244
	אביאל                         		 402
	אביבים                        		 270
	אביגדור                       		 483
	אביחיל                        		 694
	אביטל                         		 327
	אביעזר                        		 496
	אבירים                        		 150
	אבן מנחם                      		 194
	אבן ספיר                      		 368
	אבן שמואל                     		 904
	אבני איתן                     		 324
	אבשלום                        		 196
	אדורה                         		 207
	אדירים                        		 176
	אדמית                         		 205
	אדרת                          		 443
	אודים                         		 614
	אודם                      

In [66]:
# Reduce the filtered party-level table to 2 components; 'party_name' is
# passed as the third argument, mirroring the city-level call.
pca2_df = ex1.dimensionality_reduction(filtered_tdf, 2, ['party_name'])


def _real_part(value):
    """Return the real part of a complex value; pass any other value through."""
    return value.real if isinstance(value, complex) else value


# Handle complex numbers: keep only the real part of each component.
# NOTE(review): complex components presumably come from the eigen-decomposition
# inside ex1.dimensionality_reduction -- confirm there.
for component in ('PC1', 'PC2'):
    pca2_df[component] = pca2_df[component].apply(_real_part)


# Scatter plot of the reduced data (num_components = 2) to visualize the
# dimensionality reduction; points are coloured by party name.
axis_titles = {"PC1": "Principal Component 1", "PC2": "Principal Component 2"}
hover_columns = {'party_name': True, 'PC1': True, 'PC2': True}

fig = px.scatter(
    pca2_df,
    x='PC1',
    y='PC2',
    color='party_name',
    title="KNESET25 PCA BY PARTY",
    labels=axis_titles,
    hover_data=hover_columns,
    template="plotly_dark",
)

fig.show()

In [69]:
# Just for fun: cluster the 2-D PCA projection of the parties with k-means.
from sklearn.cluster import KMeans  # imported here so the cell runs on its own

# random_state pins the centroid initialisation so the cluster labels (and
# therefore the plot colours) are the same on every Restart & Run All.
# n_init is set explicitly because its default differs between scikit-learn
# versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)  # k=2
pca2_df['Cluster'] = kmeans.fit_predict(pca2_df[['PC1', 'PC2']])

# Cluster ids are nominal labels, not a numeric scale. They are plotted as
# strings so plotly draws a discrete legend instead of a continuous colour
# bar; the integer 'Cluster' column stored in pca2_df is left unchanged.
fig = px.scatter(pca2_df.assign(Cluster=pca2_df['Cluster'].astype(str)),
                 x='PC1',
                 y='PC2',
                 color='Cluster',  # Color by the cluster
                 hover_data={'party_name': True, 'PC1': True, 'PC2': True, 'Cluster': True},
                 title="PCA with Clusters",
                 labels={"PC1": "Principal Component 1", "PC2": "Principal Component 2"},
                 template="plotly_dark")

fig.show()