<a href="https://colab.research.google.com/github/zaki-m-khan/swytchstation2B/blob/main/resume-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
#import statements
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [48]:
# Load the resume dataset from a CSV file into a DataFrame.
# The path is relative, so it resolves against the kernel's current working directory.
file_path = "data/UpdatedResumeDataSet.csv"
df = pd.read_csv(file_path)

In [58]:
# Shell escape: recursively and forcibly delete the `swytchstation2B` directory
# (relative to the current working directory).
# NOTE(review): presumably a stale clone of the project repository - confirm before
# re-running, since the deletion cannot be undone.
!rm -rf swytchstation2B

In [49]:
# Colab-only step: mount Google Drive at /content/drive (needs the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Inspect the data: list the distinct job categories found in the 'Category' column.
# Optional preview of the first rows (currently disabled):
# df.head(15)
df['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [51]:
# Filter the dataset: exclude every resume whose category is one of the four listed below.
df_clean = df.loc[
    ~df['Category'].isin(['Arts', 'HR', 'Health and fitness', 'Advocate'])
]
print(df_clean)

         Category                                             Resume
0    Data Science  Skills * Programming Languages: Python (pandas...
1    Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2    Data Science  Areas of Interest Deep Learning, Control Syste...
3    Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4    Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...
..            ...                                                ...
957       Testing  Computer Skills: â¢ Proficient in MS office (...
958       Testing  â Willingness to accept the challenges. â ...
959       Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...
960       Testing  COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
961       Testing  Skill Set OS Windows XP/7/8/8.1/10 Database MY...

[832 rows x 2 columns]


In [52]:
# Print a heading followed by the number of resumes in each remaining category.
print(
    "Displaying the distinct categories of resume and the number of records belonging to each category:\n\n",
    df_clean['Category'].value_counts(),
    sep="\n",
)

Displaying the distinct categories of resume and the number of records belonging to each category:


Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
Hadoop                       42
Sales                        40
Mechanical Engineer          40
Data Science                 40
Blockchain                   40
ETL Developer                40
Operations Manager           40
Database                     33
Electrical Engineering       30
PMO                          30
DotNet Developer             28
Business Analyst             28
Automation Testing           26
Network Security Engineer    25
Civil Engineer               24
SAP Developer                24
Name: count, dtype: int64


In [53]:
# Normalise the resume text stored on `df` in a single chained pass.
df['Resume'] = (
    df['Resume']
    .str.replace(r'[^\x00-\x7F]+', ' ', regex=True)  # runs of non-ASCII characters -> one space
    .str.replace(r'\\r\\n', ' ', regex=True)         # literal backslash-r backslash-n text -> one space
    .str.replace(r'\s+', ' ', regex=True)            # collapse every whitespace run to one space
    .str.strip()                                     # trim leading/trailing whitespace
)

print(df['Resume'].head())

0    Skills * Programming Languages: Python (pandas...
1    Education Details May 2013 to May 2017 B.E UIT...
2    Areas of Interest Deep Learning, Control Syste...
3    Skills R Python SAP HANA Tableau SAP HANA SQL ...
4    Education Details MCA YMCAUST, Faridabad, Hary...
Name: Resume, dtype: object


In [54]:
# drop duplicate resumes
df = df.drop_duplicates(subset='Resume', keep='first')
df.duplicated(subset='Resume').sum()

np.int64(0)

In [55]:
# Keyword vocabularies searched for in each resume.

# Skill phrases (technical and soft skills).
skills_list = [
    "machine learning", "deep learning", "data visualization", "data analysis",
    "data engineering", "data mining", "feature engineering",
    "statistical modeling", "predictive modeling", "object oriented programming",
    "software engineering", "web development", "backend development",
    "frontend development", "cloud computing", "devops", "nlp", "computer vision",
    "project management", "time management", "problem solving",
    "communication", "collaboration", "leadership", "teamwork",
    "critical thinking", "troubleshooting", "research", "testing"
]

# Tool, language and platform names, grouped by area.
tools_list = [
    # programming languages
    "python", "java", "c", "c++", "c#", "r", "scala", "typescript", "javascript",
    # data science / ml tools
    "pandas", "numpy", "matplotlib", "tensorflow", "pytorch", "keras", "xgboost",
    "scikit-learn", "spss", "sas", "stata", "tableau", "power bi", "looker",
    # databases
    "sql", "mysql", "postgresql", "mongodb", "sqlite", "snowflake", "bigquery", "redshift",
    # devops / cloud
    "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "terraform", "ansible",
    "git", "github", "gitlab", "bitbucket", "linux",
    # web frameworks
    "flask", "django", "express", "spring boot", "fastapi", "node.js", "react", "vue", "angular",
    # computer vision / office, design and project-management tools
    "opencv", "powerpoint", "excel", "figma", "canva", "jira", "asana", "notion"
]

# Degree and qualification spellings to look for.
education_list = [
    "Bachelors", "Bachelor's", "B.Sc", "BS", "BA", "B.Tech.", "B.E.", "B.Tech", "B.E",
    "Masters", "Master's", "M.Sc", "MS", "MBA",
    "PhD", "Doctorate", "High School",
    "Diploma", "Certification"
]

# Maps each degree spelling to a canonical level. Entries of education_list that are
# absent here ("High School", "Diploma", "Certification") have no canonical level.
degree_map = {
    "Bachelors": "Bachelors",
    "Bachelor's": "Bachelors",
    "BS": "Bachelors",
    "BA": "Bachelors",  # was missing: "BA" is in education_list but was never normalised
    "B.Sc": "Bachelors",
    "B.Tech.": "Bachelors",
    "B.E.": "Bachelors",
    "B.Tech": "Bachelors",
    "B.E": "Bachelors",
    "Masters": "Masters",
    "Master's": "Masters",
    "MS": "Masters",
    "M.Sc": "Masters",
    "MBA": "Masters",
    "PhD": "PhD",
    "Doctorate": "PhD"
}

import re

def extract_keywords(text, keywords):
    """Return the keywords that occur in ``text`` as whole tokens.

    Matching is case-insensitive. A keyword counts as present only when it is
    not directly preceded or followed by a word character (letter, digit or
    underscore), so "java" is not reported for "javascript".

    Args:
        text: The string to search.
        keywords: Iterable of keyword strings; each is matched literally.

    Returns:
        The matched keywords joined with ", " in the order they appear in
        ``keywords`` (spelled as given there), or None when nothing matches.
    """
    found = []
    for word in keywords:
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # keywords ending in punctuation ("c++", "c#", "B.Tech.") could never
        # match when followed by a space or the end of the text.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # case insensitive search
        if re.search(pattern, text, flags=re.IGNORECASE):
            found.append(word)
    return ", ".join(found) if found else None

# Tag every resume with the skill, tool and education keywords it mentions.
df['Skills'] = df['Resume'].apply(lambda x: extract_keywords(str(x), skills_list))
df['Tools'] = df['Resume'].apply(lambda x: extract_keywords(str(x), tools_list))
df['Education'] = df['Resume'].apply(lambda x: extract_keywords(str(x), education_list))

# degree mapping: collapse each matched spelling to its canonical level via degree_map;
# keywords without a mapping are kept unchanged.
# Rows with no education keyword hold None. They are left missing rather than being passed
# through str(), which would store the literal text 'None' in the column.
df['Education'] = df['Education'].apply(
    lambda x: ", ".join(degree_map.get(word, word) for word in x.split(", "))
    if isinstance(x, str) else x
)

df[['Resume', 'Skills', 'Tools', 'Education']].head()

Unnamed: 0,Resume,Skills,Tools,Education
0,Skills * Programming Languages: Python (pandas...,"machine learning, deep learning, computer visi...","python, java, javascript, pandas, numpy, matpl...",
1,Education Details May 2013 to May 2017 B.E UIT...,"machine learning, research","python, keras, aws, github",Bachelors
2,"Areas of Interest Deep Learning, Control Syste...","machine learning, deep learning, data analysis...","python, java, sql, mysql, github, linux, flask...",Bachelors
3,Skills R Python SAP HANA Tableau SAP HANA SQL ...,"machine learning, deep learning, nlp, communic...","python, c, r, tableau, sql",Masters
4,"Education Details MCA YMCAUST, Faridabad, Hary...",data analysis,"python, java, c",


In [56]:
# Check for None in Education
df[df['Education'].isna() | (df['Education'] == 'None')]
print(df['Education'].isna().sum() + (df['Education'] == 'None').sum())

44
