In [0]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go



In [0]:
%fs ls dbfs:/FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/Connections-1.csv,Connections-1.csv,38624,1677821868000
dbfs:/FileStore/tables/Connections.csv,Connections.csv,38624,1677821802000
dbfs:/FileStore/tables/Fish-1.csv,Fish-1.csv,6022,1670213279000
dbfs:/FileStore/tables/Fish-2.csv,Fish-2.csv,6022,1670214072000
dbfs:/FileStore/tables/Fish.csv,Fish.csv,6022,1670213190000
dbfs:/FileStore/tables/example.txt,example.txt,1460,1670214635000
dbfs:/FileStore/tables/sample.json,sample.json,155,1670296394000


In [0]:
rdd = sc.textFile('/FileStore/tables/Connections.csv')
rdd.collect()

Out[12]: ['Notes:',
 '"When exporting your connection data, you may notice that some of the email addresses are missing. You will only see email addresses for connections who have allowed their connections to see or download their email address using this setting https://www.linkedin.com/psettings/privacy/email. You can learn more here https://www.linkedin.com/help/linkedin/answer/261"',
 '',
 'First Name,Last Name,Email Address,Company,Position,Connected On',
 'soniya,k,,NCS Group,Senior Developer,31 Jan 2023',
 'Catherine,Dignadice,catherine.dignadice@experis.com.sg,Experis Singapore,Senior Recruitment Consultant,30 Jan 2023',
 'Manraj,Kaur,,Maybank,Portfolio Risk Manager,27 Jan 2023',
 'Pritish,Mishra,,EY,Engagement Manager,14 Jan 2023',
 'Sushma,Kotturu,,IQVIA India,India- Statistical Programmer,11 Jan 2023',
 'Shree ,J S,,Valeo,Asean Corporate Recruiter,02 Jan 2023',
 'Manikanta,Kedarisetti,,"Optimum Solutions (S) Pte Ltd, Singapore",System Analyst,23 Dec 2022',
 'Liesel,Mei,,Spri

In [0]:
res=rdd.zipWithIndex().filter(lambda x:x[1] > 2).map(lambda x:x[0]) #Skip first 3 lines in the file
df = spark.read.csv(res,header=True)


In [0]:
df.show(5,truncate=False)

+----------+---------+----------------------------------+-----------------+-----------------------------+------------+
|First Name|Last Name|Email Address                     |Company          |Position                     |Connected On|
+----------+---------+----------------------------------+-----------------+-----------------------------+------------+
|soniya    |k        |null                              |NCS Group        |Senior Developer             |31 Jan 2023 |
|Catherine |Dignadice|catherine.dignadice@experis.com.sg|Experis Singapore|Senior Recruitment Consultant|30 Jan 2023 |
|Manraj    |Kaur     |null                              |Maybank          |Portfolio Risk Manager       |27 Jan 2023 |
|Pritish   |Mishra   |null                              |EY               |Engagement Manager           |14 Jan 2023 |
|Sushma    |Kotturu  |null                              |IQVIA India      |India- Statistical Programmer|11 Jan 2023 |
+----------+---------+--------------------------

In [0]:
df.select("Connected On").show(5)

+------------+
|Connected On|
+------------+
| 31 Jan 2023|
| 30 Jan 2023|
| 27 Jan 2023|
| 14 Jan 2023|
| 11 Jan 2023|
+------------+
only showing top 5 rows



In [0]:
#Convert Spark dataframe to Pandas dataframe
pandas_df = df.toPandas()

In [0]:
print(pandas_df[0:3])

  First Name  Last Name                       Email Address  \
0     soniya          k                                None   
1  Catherine  Dignadice  catherine.dignadice@experis.com.sg   
2     Manraj       Kaur                                None   

             Company                       Position Connected On  
0          NCS Group               Senior Developer  31 Jan 2023  
1  Experis Singapore  Senior Recruitment Consultant  30 Jan 2023  
2            Maybank         Portfolio Risk Manager  27 Jan 2023  


In [0]:
#Group and Sort the data by company
df_by_company = pandas_df.groupby(by="Company").count().reset_index().sort_values(by="First Name",ascending=False).reset_index(drop=True)

In [0]:
print(df_by_company[0:5])

                     Company  First Name  Last Name  Email Address  Position  \
0  Tata Consultancy Services          14         14              0        14   
1                        SAS          14         14              0        14   
2                  Symbiance          10         10              0        10   
3                         EY           8          8              0         8   
4                  NCS Group           7          7              1         7   

   Connected On  
0            14  
1            14  
2            10  
3             8  
4             7  


In [0]:
# Create a bar plot for the top companies
fig1 = px.bar(df_by_company[:20],
              x="Company",
              y="First Name",
              labels={"First Name": "Count"},
              title="Top Companies/Organizations in my Network")


In [0]:
fig1.show()

In [0]:
# Create a treemap for the top companies
fig2 = px.treemap(df_by_company[:100], path=["Company", "Position"],
                 values="First Name",
                 labels={"First Name": "Count"})

In [0]:
fig2.show()