In [None]:
# import necessary libraries
import pandas as pd
import urllib.request as ul
from bs4 import BeautifulSoup as soup
from IPython.display import display



### Load all necessary datasets

In [None]:
# Read the raw skills data, report which columns contain missing values,
# and preview the first ten rows.
data_raw = pd.read_csv(filepath_or_buffer='Raw_Skills_Dataset.csv')
print(data_raw.isna().any())
display(data_raw.head(n=10))

In [None]:
# Read the reference table of hard (technical) skills and preview its last ten rows.
hard = pd.read_csv(filepath_or_buffer='Example_Technical_Skills.csv')
display(hard.tail(n=10))

### Exploratory Analysis

In [None]:
# Strip leading whitespace from every entry of the 'RAW DATA' column,
# then show the first twenty rows as the cell output.
data_raw['RAW DATA'] = data_raw['RAW DATA'].str.lstrip(to_strip=None)
data_raw.head(n=20)

In [None]:
# Remove all the unnecessary special characters.
# Step 1 replaces every character that is not an ASCII letter, digit or space
# with a space. That substitution leaves doubled, leading and trailing spaces
# behind (e.g. "Python." -> "Python "), which would stop the exact-match
# lookups further down from finding the skill, so steps 2 and 3 collapse runs
# of spaces and trim both ends.
# The former trailing `.replace("'", "'")` was dropped: it mapped a value to
# itself and could never match after step 1 anyway.
data_raw = (
    data_raw
    .replace(r'[^0-9a-zA-Z ]', ' ', regex=True)
    .replace(r' {2,}', ' ', regex=True)
    .replace(r'^ | $', '', regex=True)
)


### Generate soft skills dataset

In [None]:
# List of soft skills
# Download the article and collect its <h3> headings; per the original note
# this yields about 135 soft skills, which a later cell stores in a CSV file.
url = 'https://www.developgoodhabits.com/soft-skills-list/'
# NOTE(review): the browser-like User-Agent is presumably needed because the
# site rejects urllib's default one - confirm.
req = ul.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# The context manager closes the connection even when read() raises, and the
# timeout keeps a full notebook run from hanging on an unresponsive server.
with ul.urlopen(req, timeout=30) as client:
    htmldata = client.read()

pagesoup = soup(htmldata, "html.parser")
# find_all is the current spelling; findAll is only kept as a legacy alias.
soft_skills_list = pagesoup.find_all('h3')

In [None]:
# write to csv file
filename = 'soft_skills.csv'
header = 'Soft Skills'

# Collect the skill names first so that a malformed heading cannot leave a
# half-written file behind.
soft_skill_names = []
for item in soft_skills_list:
    name_soft = item.find_all('span')
    if len(name_soft) == 0:
        continue

    # The first whitespace-separated token is dropped, as before.
    # NOTE(review): presumably that token is the list numbering ("1.", "2.", ...)
    # - confirm against the page.
    parts = name_soft[0].text.strip().split(' ', 1)
    if len(parts) < 2:
        # Nothing is left after dropping the first token; previously this
        # raised IndexError and aborted the cell with the file still open.
        continue

    name = parts[1].strip()
    if name:
        soft_skill_names.append(name)

# to_csv quotes values when needed, so a name containing a comma can still be
# read back with pd.read_csv; index=False keeps the single-column layout.
pd.DataFrame({header: soft_skill_names}).to_csv(filename, index=False, encoding='utf-8')



In [None]:
# Read back the scraped soft-skill list and preview its first ten rows.
soft = pd.read_csv(filepath_or_buffer='soft_skills.csv')
display(soft.head(n=10))

### Check whether any skill appears in both the hard and the soft skill lists

In [None]:
# Inner-join the two tables on exact (case-sensitive) name equality: every
# resulting row is a skill that is listed as both a hard and a soft skill.
similar = pd.merge(hard, soft, left_on=['Technology Skills'], right_on=['Soft Skills'], how='inner')
print(len(similar))

# State the conclusion that the result actually supports, instead of
# asserting a clean split regardless of what the merge returned.
if similar.empty:
    print('No skill appears in both lists: the hard and soft skills are properly separated.')
else:
    print('The following skills appear in both the hard and the soft skill list:')
    display(similar)

In [None]:
def hard_skills_extractor(data, skills=None, output_path='Hard_Technical_Skills.csv'):
    """Write every entry of ``data`` that is a known hard skill to a CSV file.

    Each text column of ``data`` is compared, by exact equality, against the
    reference skills. All matching entries are gathered into a single
    'Hard Tech Skills' column (keeping their original row index) and written
    to ``output_path``. Nothing is written when there is no match.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose text columns are searched.
    skills : list-like, optional
        Reference skill names. Defaults to the notebook-level
        ``hard['Technology Skills']`` column.
    output_path : str, optional
        Destination CSV file.

    Returns
    -------
    None
        The result is the file on disk. Errors are reported with ``print``
        rather than raised, as before.
    """
    try:
        if skills is None:
            # Resolved inside the try block so that a missing reference table
            # is reported like any other failure instead of raising.
            skills = hard['Technology Skills']

        matches = []
        for column in data.columns:
            dtype = data[column].dtype
            # Text is stored as object dtype or as pandas' string dtype,
            # depending on the pandas version and on how the frame was built.
            if dtype == object or isinstance(dtype, pd.StringDtype):
                values = data[column]
                # Keep only the matched column: selecting whole rows and then
                # assigning a single column name fails on multi-column frames.
                matched = values[values.isin(skills)]
                if len(matched) != 0:
                    matches.append(matched)

        if matches:
            # Write once, after the loop, so that matches found in a later
            # column do not overwrite those found in an earlier one.
            hard_skills = pd.concat(matches).to_frame(name='Hard Tech Skills')
            hard_skills.to_csv(output_path)
            print('file extracted')

    except FileNotFoundError:
        print("File could not be found")

    except Exception as error:
        # Exception (not a bare except) lets KeyboardInterrupt through, and the
        # message now says what actually went wrong.
        print(f'An Error occured: {error!r}')

    return

In [None]:
def soft_skills_extractor(data, skills=None, output_path='Soft_Technical_Skills.csv'):
    """Write every entry of ``data`` that is a known soft skill to a CSV file.

    Each text column of ``data`` is compared, by exact equality, against the
    reference skills. All matching entries are gathered into a single
    'Soft Skills' column (keeping their original row index) and written to
    ``output_path``. Nothing is written when there is no match.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose text columns are searched.
    skills : list-like, optional
        Reference skill names. Defaults to the notebook-level
        ``soft['Soft Skills']`` column.
    output_path : str, optional
        Destination CSV file.

    Returns
    -------
    None
        The result is the file on disk. Errors are reported with ``print``
        rather than raised, as before.
    """
    try:
        if skills is None:
            # Resolved inside the try block so that a missing reference table
            # is reported like any other failure instead of raising.
            skills = soft['Soft Skills']

        matches = []
        for column in data.columns:
            dtype = data[column].dtype
            # Text is stored as object dtype or as pandas' string dtype,
            # depending on the pandas version and on how the frame was built.
            if dtype == object or isinstance(dtype, pd.StringDtype):
                values = data[column]
                # Keep only the matched column: selecting whole rows and then
                # assigning a single column name fails on multi-column frames.
                matched = values[values.isin(skills)]
                if len(matched) != 0:
                    matches.append(matched)

        if matches:
            # Write once, after the loop, so that matches found in a later
            # column do not overwrite those found in an earlier one.
            soft_skills = pd.concat(matches).to_frame(name='Soft Skills')
            soft_skills.to_csv(output_path)
            print("file extracted")

    except FileNotFoundError:
        print("File could not be found")

    except Exception as error:
        # Exception (not a bare except) lets KeyboardInterrupt through, and the
        # message now says what actually went wrong.
        print(f'An Error occured: {error!r}')

    return

In [None]:
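# NOTE: disabled draft of a combined extractor, kept for reference only; the
# notebook calls hard_skills_extractor and soft_skills_extractor instead.
# # Code that extracts both soft and hard skills.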
# def skills_extractor(data):
    
#     try:
#         for column in data.columns:
#             if data[column].dtype == object:
#                 hard_skills = data.loc[data[column].isin(hard['Technology Skills']),:]
#                 if len(hard_skills) != 0:
#                     hard_skills.columns = ['Hard Tech Skills']
#                     hard_skills.to_csv('Hard_Technical_Skills.csv') 
                
#                 soft_skills = data.loc[data[column].isin(soft['Soft Skills']),:]
#                 if len(soft_skills) != 0:
#                     soft_skills.columns = ['Soft Skills']
#                     soft_skills.to_csv('Soft_Technical_Skills.csv')
#                 print("Files extracted")

#     except FileNotFoundError:
#         print("File could not be found")

#     except:
#         print('An Error occured') 

#     return  

In [None]:
# Run the hard-skill extraction on the cleaned raw data; when at least one
# entry matches, the function writes 'Hard_Technical_Skills.csv'.
hard_skills_extractor(data_raw)

In [None]:
# Run the soft-skill extraction on the cleaned raw data; when at least one
# entry matches, the function writes 'Soft_Technical_Skills.csv'.
soft_skills_extractor(data_raw)