# INFO7390 - Advance Data Science and Architecture

## Project Title: Job Recommendation System
### Teammates:
1. Aniruddha Tambe
2. Shubhankar Salvi
3. Sangram Vuppula

## Importing packages:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import missingno as msno
from icecream import ic
import time
import os
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
from kaleido.scopes.plotly import PlotlyScope
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
import warnings
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = 100

# plotly settings and functions
scope = PlotlyScope(plotlyjs="https://cdn.plot.ly/plotly-latest.min.js")
pio.templates.default = 'plotly_white'

## Dataset:

1. Stack Overflow 2018 Developer Survey - Individual responses on the 2018 Developer Survey fielded by Stack Overflow

https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey#survey_results_public.csv

2. U.S. Technology Jobs on Dice.com - 22,000 US-based Technology Job Listings

https://www.kaggle.com/PromptCloudHQ/us-technology-jobs-on-dicecom

In [2]:
survey = pd.read_csv("./dataset/survey_results_public.csv")

## Preliminary data summary

In [3]:
# List all column names
all_col_names = survey.columns.values.tolist()
print('Number of columns: ',len(all_col_names))
#print(all_col_names)

Number of columns:  129


In [4]:
# Get categorical columns
cat_cols = [col for col in survey.columns if survey[col].dtype.name=="object"]
float_cols = [col for col in survey.columns if survey[col].dtype.name=="float64"]
int_cols = [col for col in survey.columns if survey[col].dtype.name=="int64"]
print('Number of categorical columns: ',len(cat_cols))
print('Number of float columns: ',len(float_cols))
print('Number of int columns: ',len(int_cols))

Number of categorical columns:  87
Number of float columns:  41
Number of int columns:  1


## Findings

1. Number of categorical columns:  87
2. Number of float columns:  41
3. Number of int columns:  1
4. Datatypes found: float64(41), int64(1), object(87)
5. Rows x Columns: 98855 x 129

## Dropping irrelevant columns

In [5]:
attrToDrop=["SurveyTooLong","SurveyEasy","AdBlocker","AdBlockerDisable","AdBlockerReasons","AdsAgreeDisagree1","AdsAgreeDisagree2","AdsAgreeDisagree3","AdsActions","AdsPriorities1","AdsPriorities2","AdsPriorities3","AdsPriorities4","AdsPriorities5","AdsPriorities6","AdsPriorities7","AIDangerous","AIInteresting","AIResponsible","AIFuture","EthicsChoice","EthicsReport","EthicsResponsible","EthicsImplications","EthicsChoice","StackOverflowRecommend","StackOverflowVisit","SurveyTooLong","SurveyEasy","AdBlocker","AdBlockerDisable","AdBlockerReasons","AdsAgreeDisagree1","AdsAgreeDisagree2","AdsAgreeDisagree3","AdsActions","AdsPriorities1","AdsPriorities2","AdsPriorities3","AdsPriorities4","AdsPriorities5","AdsPriorities6","AdsPriorities7","AIDangerous","AIInteresting","AIResponsible","AIFuture","EthicsChoice","EthicsReport","EthicsResponsible","EthicsImplications","EthicsChoice","StackOverflowRecommend","StackOverflowVisit","StackOverflowHasAccount","StackOverflowParticipate","StackOverflowJobs","StackOverflowDevStory","StackOverflowJobsRecommend","StackOverflowConsiderMember","HypotheticalTools1","HypotheticalTools2","HypotheticalTools3","HypotheticalTools4","HypotheticalTools5","JobContactPriorities1","JobContactPriorities2","JobContactPriorities3","JobContactPriorities4","JobContactPriorities5","TimeAfterBootcamp","StackOverflowHasAccount","StackOverflowParticipate","StackOverflowJobs","StackOverflowDevStory","StackOverflowJobsRecommend","StackOverflowConsiderMember","HypotheticalTools1","HypotheticalTools2","HypotheticalTools3","HypotheticalTools4","HypotheticalTools5","JobContactPriorities1","JobContactPriorities2","JobContactPriorities3","JobContactPriorities4","JobContactPriorities5","TimeAfterBootcamp","HackathonReasons","ErgonomicDevices","Hobby","NumberMonitors","Salary","CheckInCode","WakeTime","TimeFullyProductive","SkipMeals","HoursOutside","Exercise","EthicalImplications","EducationParents"]
survey=survey[survey.columns.difference(attrToDrop)]
print("Number of columns dropped: ",len(attrToDrop))

Number of columns dropped:  102


In [6]:
# Get categorical columns
cat_cols = [col for col in survey.columns if survey[col].dtype.name=="object"]
float_cols = [col for col in survey.columns if survey[col].dtype.name=="float64"]
int_cols = [col for col in survey.columns if survey[col].dtype.name=="int64"]
print('Number of categorical columns: ',len(cat_cols))
print('Number of float columns: ',len(float_cols))
print('Number of int columns: ',len(int_cols))
print('Total number of columns: ',len(survey.columns))

Number of categorical columns:  43
Number of float columns:  29
Number of int columns:  1
Total number of columns:  73


## Replace NA values with ""

In [7]:
survey = survey.replace(np.nan, '', regex=True)

## Encode categorical features

In [8]:
# Creating dictionaries

Frameworknextyear=list()
for value in survey["FrameworkDesireNextYear"]:
        new=str(value).split(';')
        for i in new:
            Frameworknextyear.append(i)
sFNY=set(Frameworknextyear)

FrameworkWorkedWith=list()
for value in survey["FrameworkWorkedWith"]:
        new=str(value).split(';')
        for i in new:
            FrameworkWorkedWith.append(i)
sFW=set(FrameworkWorkedWith)

PlatformDesireNextYear=list()
for value in survey["PlatformDesireNextYear"]:
        new=str(value).split(';')
        for i in new:
            PlatformDesireNextYear.append(i)
sPDNY=set(PlatformDesireNextYear)

PlatformWorkedWith=list()
for value in survey["PlatformWorkedWith"]:
        new=str(value).split(';')
        for i in new:
            PlatformWorkedWith.append(i)
sPWW=set(PlatformWorkedWith)

LanguageDesireNextYear=list()
for value in survey["LanguageDesireNextYear"]:
        new=str(value).split(';')
        for i in new:
            LanguageDesireNextYear.append(i)
sLDNY=set(LanguageDesireNextYear)

LanguageWorkedWith=list()
for value in survey["LanguageWorkedWith"]:
        new=str(value).split(';')
        for i in new:
            LanguageWorkedWith.append(i)
sLWW=set(LanguageWorkedWith)
         
CommunicationTools=list()
for value in survey["CommunicationTools"]:
        new=str(value).split(';')
        for i in new:
            CommunicationTools.append(i)
sCT=set(CommunicationTools)

DatabaseWorkedWith=list()
for value in survey["DatabaseWorkedWith"]:
        new=str(value).split(';')
        for i in new:
            DatabaseWorkedWith.append(i)
sDWW=set(DatabaseWorkedWith)

DatabaseDesireNextYear=list()
for value in survey["DatabaseDesireNextYear"]:
        new=str(value).split(';')
        for i in new:
            DatabaseDesireNextYear.append(i)
sDNY=set(DatabaseDesireNextYear)

DevType=list()
for value in survey["DevType"]:
        new=str(value).split(';')
        for i in new:
            DevType.append(i)
sDT=set(DevType)

# Combining all frameworks
combined=sFNY.union(sFW,sFNY,sPDNY,sPWW,sLDNY,sLWW,sCT,sDWW,sDNY,sDT)
all_framework=list(combined)
all_fw=pd.DataFrame(all_framework)

In [9]:
# Userwise framework information
FrameworkWorkedWith = pd.DataFrame(survey['Respondent'])
for i in sFW:
    #print(i)
    FrameworkWorkedWith[i]=""

PlatformWorkedWith = pd.DataFrame(survey['Respondent'])
for i in sPWW:
    #print(i)
    PlatformWorkedWith[i]=""

LanguageWorkedWith = pd.DataFrame(survey['Respondent'])
for i in sLWW:
    LanguageWorkedWith[i]=""

DatabaseWorkedWith = pd.DataFrame(survey['Respondent'])
for i in sDWW:
    DatabaseWorkedWith[i]=""

DevType = pd.DataFrame(survey['Respondent'])
for i in sDT:
    DevType[i]=""

CommunicationTools = pd.DataFrame(survey['Respondent'])
for i in sCT:
    CommunicationTools[i]=""

In [10]:
# Forming dictionary

coldic=dict(zip(FrameworkWorkedWith.columns,range(0,len(sFW)+1)))
for i in range(98855):
    #try:
    data=(survey.loc[i,'FrameworkWorkedWith']).split(';')
    #except:
        #print("Error: ",i," - ",survey.loc[i,'FrameworkWorkedWith'])
    if(data[0]!=""):
       # print(data)
        for value in data:
            FrameworkWorkedWith.iloc[i,coldic[value]]=1

#PlatformWorkedWith
coldic1=dict(zip(PlatformWorkedWith.columns,range(0,len(sPWW)+1)))
for i in range(98855):
    data=(survey.loc[i,'PlatformWorkedWith']).split(';')
    if(data[0]!=""):
       # print(data)
        for value in data:
            PlatformWorkedWith.iloc[i,coldic1[value]]=1

#LanguageWorkedWith
coldic2=dict(zip(LanguageWorkedWith.columns,range(0,len(sLWW)+1)))
for i in range(98855):
    data=(survey.loc[i,'LanguageWorkedWith']).split(';')
    if(data[0]!=""):
       # print(data)
        for value in data:
            LanguageWorkedWith.iloc[i,coldic2[value]]=1

#DatabasesWorkedWth
coldic3=dict(zip(DatabaseWorkedWith.columns,range(0,len(sDWW)+1)))
for i in range(98855):
    data=(survey.loc[i,'DatabaseWorkedWith']).split(';')
    if(data[0]!=""):
       # print(data)
        for value in data:
            DatabaseWorkedWith.iloc[i,coldic3[value]]=1

#DevType
coldic4=dict(zip(DevType.columns,range(0,len(sDT)+1)))
for i in range(98855):
    data=(survey.loc[i,'DevType']).split(';')
    if(data[0]!=""):
       # print(data)
        for value in data:
            DevType.iloc[i,coldic4[value]]=1

#CommmunicationTools
coldic5=dict(zip(CommunicationTools.columns,range(0,len(sCT)+1)))
for i in range(98855):
    data=(survey.loc[i,'CommunicationTools']).split(';')
    if(data[0]!=""):
        for value in data:
            CommunicationTools.iloc[i,coldic5[value]]=1

## Export survey to csv

In [12]:
survey.to_csv('./dataset/survey_dropped_columns.csv',index=False)
LanguageWorkedWith.to_csv("./dataset/LanguageWorkedWith.csv",index=False)
DatabaseWorkedWith.to_csv("./dataset/DatabaseWorkedWith.csv",index=False)
CommunicationTools.to_csv("./dataset/CommunicationTools.csv",index=False)
DevType.to_csv("./dataset/DevType.csv",index=False)
FrameworkWorkedWith.to_csv("./dataset/FrameworkWorkedWith.csv",index=False)
PlatformWorkedWith.to_csv("./dataset/PlatformWorkedWith.csv",index=False)

## Encoding categorical features

In [None]:
encode = LabelEncoder()

In [None]:
cat_cols = [col for col in survey.columns if survey[col].dtype.name=="object"]
# Converting the variables
for col in cat_cols:
    print('Converted: ', col)
    survey[col]=encode.fit_transform(survey[col])

## Imputing values

In [None]:
# Imputing assessment columns with mode value
for col in survey.columns: 
    if col.__contains__('Assess'):
        #print('Found: ',col)
        survey[col] = survey[col].fillna(survey[col].mode()[0])