In [1]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Obtain the Spark session used by every later cell (getOrCreate hands back an
# existing session when there is one instead of building a second).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Fetch the two zipped datasets from the S3 bucket and unpack them next to the notebook.
import zipfile

import boto3

s3 = boto3.client('s3')  # client used for the downloads below

# (object key inside the bucket, local file name for the downloaded archive)
for s3_key, local_archive in (
    ('all-us-stocks-tickers-company-info-logos.zip', 'us_stocks.zip'),
    ('data-scientist-job-market-in-the-us.zip', 'ds_jobs.zip'),
):
    s3.download_file('blossom-data-engs', s3_key, local_archive)
    # The notebook reads plain CSV files right after this cell, but only the
    # .zip archives were ever written to disk, so a fresh run could not find them.
    # Unpack each archive into the working directory.
    # NOTE(review): assumes the archives hold companies.csv / alldata.csv at
    # their top level — confirm against the bucket contents.
    with zipfile.ZipFile(local_archive) as archive:
        archive.extractall()

In [61]:
# Load the two CSV files into Spark DataFrames with the same reader options:
# header row present, column types inferred, multiLine parsing enabled.
companies, alldata = (
    spark.read.csv(csv_path, header=True, inferSchema=True, multiLine=True)
    for csv_path in ("companies.csv", "alldata.csv")
)

In [62]:
# Both frames carry a 'description' column; rename each one so they stay
# distinguishable once the frames are joined.
companies = companies.withColumnRenamed(existing='description', new='company_description')
alldata = alldata.withColumnRenamed(existing='description', new='job_description')

In [63]:
# Inner join: keep only rows where a job posting's 'company' equals a
# company's 'company name'.
merged_data = companies.join(
    alldata,
    on=alldata['company'] == companies['company name'],
    how='inner',
)

In [65]:
# Derive the city from the location column by keeping the text before the
# first comma (e.g. 'Austin, TX' -> 'Austin').
# withColumn replaces an existing 'city' column, so re-running this cell does
# not append a second, ambiguous 'city' column the way select('*', ...) did.
merged_data = merged_data.withColumn(
    'city',
    F.split(alldata['location'], ',').getItem(0),
)

In [67]:
#Importing NGRAM and Tokenizer to be used to generate ngrams.
from pyspark.ml.feature import NGram, Tokenizer

In [68]:
#Our function to generate an ngram from a particular dataframe and column
def create_ngram(df, col, n=2):
    """Tokenize a text column and attach n-grams built from its tokens.

    Parameters
    ----------
    df : DataFrame to transform. A new frame is returned; `df` is not reassigned.
    col : name of the text column to tokenize.
    n : number of consecutive tokens per n-gram. Defaults to 2 (bigrams), which
        was previously hard-coded.

    Returns
    -------
    The transformed DataFrame with two added columns: 'tokens' (output of the
    Tokenizer) and 'ngrams' (output of the NGram stage).
    """
    # Stage 1: split the text in `col` into a 'tokens' column.
    tokenizer = Tokenizer(inputCol=col, outputCol='tokens')
    # Stage 2: combine every run of `n` consecutive tokens into one n-gram.
    ngram = NGram(n=n, inputCol='tokens', outputCol='ngrams')
    return ngram.transform(tokenizer.transform(df))


In [86]:
# Build the token and ngram columns from the company descriptions.
new_data = create_ngram(df=merged_data, col='company_description')

In [85]:
# Peek at the first two rows to check the derived city and ngram columns.
new_data.select('location', 'city', 'ngrams').show(n=2)

+----------+------+--------------------+
|  location|  city|              ngrams|
+----------+------+--------------------+
|Austin, TX|Austin|[cubic corp, corp...|
|Austin, TX|Austin|[the hershey, her...|
+----------+------+--------------------+
only showing top 2 rows



In [87]:
#A function to implement frequency count on a given column in a dataframe.
def create_freq_df(df, col):
    """Count how often each ngram occurs for every value of `col`.

    `df` must carry an 'ngrams' array column. Each array element becomes its
    own row, rows are counted per (`col`, ngram) pair, and the result is
    returned with the columns `col`, 'ngrams' and 'frequency', sorted with the
    highest frequency first.
    """
    # One row per individual ngram, still paired with its `col` value.
    one_ngram_per_row = (
        df.select(['ngrams', col])
        .select(col, F.explode('ngrams').alias('ngrams'))
    )
    pair_counts = one_ngram_per_row.groupby([col, 'ngrams']).count()
    # The aggregate column comes back named 'count'; expose it as 'frequency'.
    frequencies = pair_counts.withColumnRenamed('count', 'frequency')
    return frequencies.orderBy(F.desc('frequency'))


In [88]:
# Ngram frequencies grouped by industry.
industry_freq_df = create_freq_df(df=new_data, col='industry')

In [89]:
# Ngram frequencies grouped by city.
city_freq_df = create_freq_df(df=new_data, col='city')

In [98]:
# Summary statistics (count, mean, stddev, min, max) of the industry frequency table.
(
    industry_freq_df
    .describe()
    .show()
)

+-------+--------------------+-----------------+-----------------+
|summary|            industry|           ngrams|        frequency|
+-------+--------------------+-----------------+-----------------+
|  count|                 702|              702|              702|
|   mean|                null|             null|5.249287749287749|
| stddev|                null|             null|9.044617147747154|
|    min| Aerospace & Defense|       & casualty|                1|
|    max|Retail - Apparel ...|www.ebay.com, its|               52|
+-------+--------------------+-----------------+-----------------+



In [83]:
# Show the 20 most frequent (city, ngram) pairs.
# NOTE(review): the saved output under this cell lists job-posting phrases
# (e.g. 'ability to', 'experience in'), while the notebook builds its ngrams
# from 'company_description', and the execution counts are out of order —
# the output may come from an earlier variant of the cells; re-run to confirm.
city_freq_df.show(n=20)

+---------+-----------------+---------+
|     city|           ngrams|frequency|
+---------+-----------------+---------+
|Cambridge|       ability to|      130|
|Cambridge|           in the|      130|
|Cambridge|           of the|      104|
|Cambridge|    experience in|       93|
|Cambridge|           to the|       85|
|San Diego|           in the|       82|
|Cambridge|             in a|       79|
|Cambridge|          and the|       78|
|Cambridge|             as a|       67|
|Cambridge|          to work|       65|
|Cambridge|  experience with|       65|
|Cambridge|equal opportunity|       64|
|Cambridge|        sanofi is|       59|
|Cambridge|          well as|       58|
|Cambridge|          as well|       57|
|Cambridge|     committed to|       56|
|Cambridge|        regard to|       55|
|Cambridge|          will be|       55|
|San Diego|           of the|       54|
|Cambridge|        pfizer is|       48|
+---------+-----------------+---------+
only showing top 20 rows

