In [8]:
import pandas as pd
import utils
import preprocessing
import factors
from collections import Counter

In [2]:
def load_data(year):
    #import BDD file
    df_y1_y2 = utils.import_BDD(f"data/BDD{year}.csv")
    #import qualtrics file (substituted by an empty file if not avaialable)
    df_y3 = utils.import_qualtrics(f"data/qualtrics{year}.csv")
    #merge the two data sources (BDD and qualtrics)
    df_all = pd.merge(df_y1_y2,df_y3, how="outer", on="bid")
    #import file containing admission codes
    admissions = pd.read_csv("data/admission.csv")
    admissions.dropna(inplace = True)
    admissions.drop("STVATTS_DESC", axis = 1, inplace = True)

    #join the admissions codes with the general df
    df = df_all.merge(admissions, how = "left", left_on = "admission1", right_on = "STVATTS_CODE")
    df.drop("STVATTS_CODE", axis = 1, inplace = True)
    return df

In [3]:
df1 = load_data(2018)
df2 = load_data(2019)
df3 = load_data(2020)

In [5]:
jobs = []

for df in [df1,df2,df3]:
    job_post_y1=df.job_post_y1.apply(preprocessing.preprocess_job_post).values.tolist()
    job_post_y1 = [j for j in job_post_y1 if j==j]

    job_post_y2=df.job_post_y2.apply(preprocessing.preprocess_job_post).values.tolist()
    job_post_y2 = [j for j in job_post_y2 if j==j]

    job_post_y3=df.job_post_y3.apply(preprocessing.preprocess_job_post).values.tolist()
    job_post_y3 = [j for j in job_post_y3 if j==j]

    job_post_now=df.job_post_now.apply(preprocessing.preprocess_job_post).values.tolist()
    job_post_now = [j for j in job_post_now if j==j]
    
    #get a list with all job postings 
    jobs += job_post_y1
    jobs += job_post_y2
    jobs +=job_post_y3
    jobs += job_post_now


In [10]:
len(jobs)

2721

In [9]:
word_count = Counter(" ".join(jobs).split()).most_common(50)
word_frequency = pd.DataFrame(word_count, columns = ['Word', 'Frequency'])
word_frequency.head(20)

Unnamed: 0,Word,Frequency
0,consultant,470
1,analyst,389
2,manager,360
3,associate,243
4,senior,194
5,analyste,153
6,business,139
7,data,113
8,chef,107
9,junior,104


In [11]:
junior_words = ['consultant','analyst','consultante','analyste','junior','graduate']
senior_words = ['manager','senior','associate','chef','responsable','head','lead','specialist','ceo']

In [14]:
words_chosen = word_frequency.loc[word_frequency['Word'].isin(junior_words+senior_words)] 

In [16]:
words_chosen.loc[:,"Percentage"] = words_chosen["Frequency"].apply(lambda x: f"{round(x/len(jobs)*100,2)}%")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [19]:
words_chosen[["Word", "Percentage"]]

Unnamed: 0,Word,Percentage
0,consultant,17.27%
1,analyst,14.3%
2,manager,13.23%
3,associate,8.93%
4,senior,7.13%
5,analyste,5.62%
8,chef,3.93%
9,junior,3.82%
10,consultante,3.49%
17,responsable,2.5%


In [20]:
#tot percentage of job positions covered
words_chosen["Frequency"].apply(lambda x: x/len(jobs)).sum()

0.8566703417861081