## Clean Job Posting data

In [1]:
import pandas as pd

posting = pd.read_csv('job_postings.csv')
posting = posting.drop(['last_processed_time','last_status','first_seen','got_summary','got_ner','is_being_worked','company','search_city','search_position'],axis=1)
print(posting.info())
print(posting.groupby(['job_level','search_country']).size())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12217 entries, 0 to 12216
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   job_link        12217 non-null  object
 1   job_title       12217 non-null  object
 2   job_location    12216 non-null  object
 3   search_country  12217 non-null  object
 4   job_level       12217 non-null  object
 5   job_type        12217 non-null  object
dtypes: object(6)
memory usage: 572.8+ KB
None
job_level   search_country
Associate   Australia           20
            Canada              49
            United Kingdom     115
            United States     1114
Mid senior  Australia          281
            Canada             581
            United Kingdom     880
            United States     9177
dtype: int64


In [2]:
skill = pd.read_csv('job_skills.csv')
df = pd.merge(posting[posting['job_level']=='Associate'],skill,how='left',on='job_link')

df.loc[df['job_title'].str.contains('Data Analyst'), 'job_title'] = 'Data Analyst'
df.loc[df['job_title'].str.contains('Data Engineer'), 'job_title'] = 'Data Engineer'
df.loc[df['job_title'].str.contains('Data Scientist'), 'job_title'] = 'Data Scientist'
df.loc[df['job_title'].str.contains('Machine Learning Engineer'), 'job_title'] = 'Machine Learning Engineer'
df.loc[df['job_title'].str.contains('Data Entry Specialist'), 'job_title'] = 'Data Entry Specialist'

# print(df.groupby(['job_title']).size().sort_values(ascending=False).head(10))

In [3]:

# top_list = ('Data Analyst|Data Engineer|Data Scientist|Machine Learning Engineer')

top_list = ('Data Scientist')
top_df = df.loc[df['job_title'].str.contains(top_list), :]
# print(top_df.info())

skills_df = top_df['job_skills'].apply(lambda x: x.split(', ')).apply(pd.Series)
# print(skills_df.columns)
skills_df = skills_df.fillna('')

# print(skills_df)

cols_count = {}
for row in [skills_df.iloc[i,:].tolist() for i in range(0,len(skills_df))]:
    for entry in row:
        if entry in cols_count.keys():
            cols_count[entry] += 1
        else:
            cols_count[entry] = 1

print(cols_count) 

{'Machine learning': 17, 'Regression': 11, 'Natural Language Processing (NLP)': 1, 'Neural networks': 3, 'Quantitative research methods': 1, 'Statistics': 26, 'Bayesian analysis': 1, 'Pandas': 7, 'Scikitlearn': 7, 'Stats models': 1, 'TensorFlow': 8, 'MXNet': 1, 'SageMaker': 2, 'R': 38, 'API development': 1, 'Java frameworks': 1, 'Web services': 1, 'UI development': 1, 'Git': 6, 'Atlassian Jira': 1, 'Atlassian Confluence': 1, 'Slack': 1, 'Agile methodology': 1, 'Computer science': 6, 'Mathematics': 9, 'Physics': 2, 'Economics': 5, 'Engineering': 8, 'Operations research': 3, 'Quantitative social science': 1, '': 6940, 'Python': 65, 'RDBMS': 1, 'Kafka': 3, 'SQL': 45, 'PySpark': 4, 'Hive': 9, 'Logistic regression': 5, 'Naïve Bayes': 1, 'SVM': 2, 'Decision trees': 1, 'AWS': 10, 'Java': 12, 'Tensorflow': 3, 'PyTorch': 8, 'Natural Language Processing': 9, 'ML systems': 1, 'Streaming data flows': 1, 'Electrical engineering': 1, 'Econometrics': 5, 'Signal processing': 1, 'ETL data pipeline': 1,

In [4]:
# Import process from thefuzz
from thefuzz import process # Levenshtein algorithm

# Store the unique values of cuisine_type in unique_types
unique_types = cols_count.keys()

# Function to clean DataFrame based on similarity scores
def clean_dataframe(df, threshold=80):
    # Count occurrences of each entry
    cols_count = df.stack().value_counts().to_dict()
    
    # Calculate similarity of 'Data Analytics', 'Data Visualization', 'Tableau', and 'Power BI' to all values
    skills = ['Data Analysis','Data Analytics', 'Data Visualization', 'Tableau', 'Power BI', 'SQL','.Net','A/B Testing','Business Intelligence','Database Management','Data Science']
    
    for skill in skills:
        matches = process.extract(skill, cols_count.keys())
        # Iterate through the list of matches
        for match in matches:
            # Check whether the similarity score is greater than or equal to threshold
            if match[1] >= threshold:
                # Replace the matched entry in DataFrame
                df.replace(match[0], skill, inplace=True)
    
    return df


# Applying cleaning function to DataFrame
test = clean_dataframe(skills_df)
test = test.replace('.*AWS.*', 'AWS', regex=True)
test = test.replace('.*Amazon.*', 'AWS', regex=True)
test = test.replace('.*Agile.*', 'Agile Method', regex=True)
test = test.replace('.*SQL.*', 'SQL', regex=True)
test = test.replace('.*API.*', 'API', regex=True)
test = test.replace('.*Azure.*', 'Azure', regex=True)
test = test.replace('.*Tableau.*', 'Data Visualization', regex=True)
test = test.replace('.*Power BI.*', 'Data Visualization', regex=True)
test = test.replace('.*Business Intelligence*', 'Data Visualization', regex=True)
test = test.replace('.*Data Visualisation*', 'Data Visualization', regex=True)
test = test.replace('.*Data Integration*', 'ETL/ELT', regex=True)
test = test.replace('.*Computer Science*', 'ETL/ELT', regex=True)
test = test.replace('.*Data validation*', 'ETL/ELT', regex=True)
test = test.replace('.*Data collection*', 'ETL/ELT', regex=True)
test = test.replace('.*Data consistency*', 'ETL/ELT', regex=True)
test = test.replace('.*Data integrity*', 'Data Analytics', regex=True)
test = test.replace('.*Data Analysis*', 'Data Analytics', regex=True)
test = test.replace('.*Data Exploration*', 'Data Analytics', regex=True)
test = test.replace('.*Extraction*', 'ETL/ELT', regex=True)
test = test.replace('.*Database*', 'Database Management', regex=True)
test = test.replace('.*Data Engineering*', 'ETL/ELT', regex=True)
test = test.replace('.*Data Wrangling*', 'ETL/ELT', regex=True)
test = test.replace('.*Data Pipelines*', 'ETL/ELT', regex=True)
test = test.replace('.*Data Security*', 'ETL/ELT', regex=True)
test = test.replace('.*Data Modeling*', 'ETL/ELT', regex=True)
tset = test.replace('.*Data Management*', 'ETL/ELT', regex=True)
test = test.replace('.*python*', 'Python', regex=True)

cols_count2 = {}
for row in [test.iloc[i,:].tolist() for i in range(0,len(test))]:
    for entry in row:
        if entry in cols_count2.keys():
            cols_count2[entry] += 1
        else:
            cols_count2[entry] = 1
            
# print(cols_count2)


test2 = pd.DataFrame.from_dict(cols_count2,orient='index')
test2 = test2.reset_index()
# print(test2.info())
test2.rename(columns={'index':'skill',0:'counts'},inplace = True)


test2_sorted = test2[test2['skill'] != ''].sort_values(by='counts', ascending=False)
# Print the sorted DataFrame
print(test2_sorted.head(20))

# filter_dict = {}
# for row, item in enumerate(cols_count2):
#     for key, value in enumerate(item):
#         if int(value) > 5:
#             filter_dict[key] = value

# print(filter_dict)
# print(sorted((cols_count2.values())))

# value = {i for i in cols_count2 if cols_count2[i]==696}
# print("key by value:",value)

                  skill  counts
93   Data Visualization      75
31               Python      65
34                  SQL      56
110      Data Analytics      55
61         Data Science      45
65     Machine Learning      45
90              ETL/ELT      45
13                    R      38
5            Statistics      26
245       Communication      20
75                Spark      18
76               Hadoop      18
0      Machine learning      17
41                  AWS      17
205         Data Mining      16
100        Data science      12
114          Clustering      12
42                 Java      12
1            Regression      11
115      Classification      10
