In [1]:
import os
import pyspark
import boto3
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark import SparkContext

In [2]:
#create spark session
spark = SparkSession.builder.getOrCreate()

In [3]:
#Initializing s3 resource
s3 = boto3.client('s3')

In [11]:
#Downloading data from bucket 
s3.download_file('blossom-data-engs','all-us-stocks-tickers-company-info-logos.zip', 'all-us-data.zip')
s3.download_file('blossom-data-engs', 'data-scientist-job-market-in-the-us.zip', 'all-data-science.zip')

In [4]:
#Using spark to read our data.
companies = spark.read.csv('companies.csv', header=True, inferSchema=True, multiLine=True)
alldata = spark.read.csv('alldata.csv', header=True, inferSchema=True, multiLine=True)

In [6]:
companies.count()

6512

In [7]:
alldata.count()

13513

In [10]:
companies.show(2)

+------+--------------------+----------+--------------------+--------------------+--------------------+------+--------------------+--------------------+-----------+---------------+---------------+--------------------+--------------------+
|ticker|        company name|short name|            industry|         description|             website|  logo|                 ceo|            exchange| market cap|         sector|          tag 1|               tag 2|              tag 3|
+------+--------------------+----------+--------------------+--------------------+--------------------+------+--------------------+--------------------+-----------+---------------+---------------+--------------------+--------------------+
|     A|Agilent Technolog...|   Agilent|Medical Diagnosti...|Agilent Technolog...|http://www.agilen...| A.png| Michael R. McMullen|New York Stock Ex...|24218068096|     Healthcare|     Healthcare|Diagnostics & Res...|Medical Diagnosti...|
|    AA|   Alcoa Corporation|     Alcoa|    

In [11]:
alldata.show(2)

+--------------------+------------------+--------------------+-------+------------------+
|            position|           company|         description|reviews|         location|
+--------------------+------------------+--------------------+-------+------------------+
|Development Director|           ALS TDI|Development Direc...|   null|Atlanta, GA 30301 |
|An Ostentatiously...|The Hexagon Lavish|"Job Description
...|   null|              null|
+--------------------+------------------+--------------------+-------+------------------+
only showing top 2 rows



In [12]:
#Renaming 'description' column for both dataframes since it appears in both and we want to merge.
companies = companies.withColumnRenamed('description', 'company_description')
alldata = alldata.withColumnRenamed('description', 'job_description')

In [13]:
#Merging Companies and AllData with an inner join.
merged_data =companies.join(alldata, alldata['company'] == companies['company name'])

In [14]:
merged_data.show(2)

+------+-------------------+-------------------+--------------------+--------------------+--------------------+----+-------------------+--------------------+-----------+------------------+------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------+----------+
|ticker|       company name|         short name|            industry| company_description|             website|logo|                ceo|            exchange| market cap|            sector|             tag 1|              tag 2|              tag 3|            position|            company|     job_description|reviews| location|
+------+-------------------+-------------------+--------------------+--------------------+--------------------+----+-------------------+--------------------+-----------+------------------+------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------+----------+
|   CUB|  

In [24]:
merged_data = merged_data.withColumnRenamed('location\r', 'location')
alldata = alldata.withColumnRenamed('location\r', 'location')

In [27]:
#Extracting city from location column.
merged_data = merged_data.select('*', F.split(merged_data['location'], ',')[0].alias('city'))

In [28]:
#Import NGRAM and Tokenizer to be used to generate ngrams
from pyspark.ml.feature import NGram, Tokenizer

In [31]:
#Function to generate ngram(s) from a particular column in a dataframe

def create_ngram(df, col):
    #Generate tokens as a new column on the dataframe given as a parameter
    tokens = Tokenizer(inputCol = col, outputCol = 'tokens')
    
    #Apply the tokenizer on the dataframe
    new_df = tokens.transform(df)
    
    #Generating an NGRAM column where n = 2 in this example
    ngram = NGram(n = 2, inputCol = 'tokens', outputCol = 'ngrams')
    
    #Apply the ngram on the dataframe
    new_df = ngram.transform(new_df) 
    
    return new_df

In [32]:
#Calling our function on the merged_data and using 'company_description' as column
new_data = create_ngram(merged_data, 'company_description')

In [34]:
#Let's see the results after the function has been applied.
new_data.select(['location', 'city', 'ngrams']).show(3)

+----------+------+--------------------+
|  location|  city|              ngrams|
+----------+------+--------------------+
|Austin, TX|Austin|[cubic corp, corp...|
|Austin, TX|Austin|[the hershey, her...|
|Austin, TX|Austin|[abbott laborator...|
+----------+------+--------------------+
only showing top 3 rows



In [35]:
#Creating a function to implement frequency count on a given column in a dataframe object.

def create_freq_df(df, col):
    #Exploding the ngrams columns to count each ngram generated on a column
    freq_count = df.select(col, F.explode('ngrams').alias('ngrams')).groupby([col, 'ngrams']).count()
    
    #Changing the column name from 'count' (which is default) to frequency
    freq_count = freq_count.withColumnRenamed('count', 'frequency')
    
    #Ordering count in descending order.
    freq_count = freq_count.orderBy(freq_count['frequency'].desc())
    
    return freq_count

In [37]:
#Calling our frequency function on the industry column 
industry_freq_df = create_freq_df(new_data, 'industry')

In [43]:
#Let's see the results of the above
industry_freq_df.show()

+------------------+--------------------+---------+
|          industry|              ngrams|frequency|
+------------------+--------------------+---------+
|     Biotechnology|                is a|       52|
|Drug Manufacturers|        and vaccines|       47|
|     Biotechnology|             corp is|       42|
|     Biotechnology|            to treat|       41|
|     Biotechnology|  and commercializes|       36|
|     Biotechnology|    istodax, otezla,|       34|
|     Biotechnology|      its registered|       34|
|     Biotechnology|   registered brands|       34|
|     Biotechnology| pomalyst, revlimid,|       34|
|     Biotechnology|         designed to|       34|
|     Biotechnology|  abraxane, istodax,|       34|
|     Biotechnology|        celgene corp|       34|
|     Biotechnology|discovers, develops,|       34|
|     Biotechnology|        treat cancer|       34|
|     Biotechnology|and immune-inflam...|       34|
|     Biotechnology|        company that|       34|
|     Biotec

In [38]:
#Calling our frequency function on city columns
city_freq_df = create_freq_df(new_data, 'city')

In [42]:
#Let's see the result of the above as well
city_freq_df.show(5)

+---------+--------------------+---------+
|     city|              ngrams|frequency|
+---------+--------------------+---------+
|Cambridge|        and vaccines|       43|
|Cambridge|              in the|       28|
|Cambridge|                is a|       28|
|Cambridge|          engaged in|       28|
|Cambridge|research, develop...|       27|
+---------+--------------------+---------+
only showing top 5 rows

