# In this notebook we will extract skills from PDF files

### We will start by loading SkillNer

In [2]:
import warnings
warnings.filterwarnings('ignore')
import spacy
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher
import inspect

# load default skills data base
from skillNer.general_params import SKILL_DB
# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor
from  skillNer.matcher_class import SkillsGetter
from  skillNer.matcher_class import Matchers


# init params of skill extractor
import en_core_web_sm

# Load the small English spaCy pipeline; it is handed to SkillExtractor below.
# NOTE(review): the start word used for PDF extraction later in this notebook is
# French — confirm that an English pipeline is the intended choice.
nlp = en_core_web_sm.load()
# Build the skill extractor from the spaCy pipeline, the default skills database
# (SKILL_DB) and spaCy's PhraseMatcher class (the class itself, not an instance).
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

# Resolve the path of the file that defines SkillExtractor — presumably to check
# which installed copy of skillNer this kernel is importing.
source_file = inspect.getsourcefile(SkillExtractor)

print(source_file)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
C:\Users\MSi\AppData\Roaming\Python\Python39\site-packages\skillNer\skill_extractor_class.py


### Function to extract text from all pages of each PDF file in a specific folder

In [3]:
import os
import PyPDF2
import pandas as pd

# Function to extract text from a PDF file starting from a given word
def extract_text_from_pdf(file_path, start_word):
    """Return the text of a PDF, starting at the first occurrence of ``start_word``.

    Pages are read in order. Everything before the first occurrence of
    ``start_word`` in the document is discarded (earlier pages and the
    beginning of the page that contains it); the rest of that page and every
    following page are kept in full. Later occurrences of ``start_word`` do
    not truncate anything.

    Args:
        file_path: Path of the PDF file to read.
        start_word: Marker string from which the extraction should start.
            The comparison is an exact, case-sensitive substring search.

    Returns:
        str: The concatenated page texts from the marker onwards, or the
        concatenated text of all pages when the marker is not found anywhere.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pull the text while the file is still open; a page that yields no
        # text is treated as an empty string so concatenation cannot fail.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    for page_number, text in enumerate(page_texts):
        start_index = text.find(start_word)
        if start_index != -1:
            # Only the first occurrence in the whole document counts: cut the
            # page there and append every later page untouched.
            return text[start_index:] + "".join(page_texts[page_number + 1:])

    # Marker absent from every page: fall back to the full document text.
    return "".join(page_texts)

# Folder path containing the PDF files
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
folder_path = 'C:/Users/MSi/Desktop/comp_based/OptionDS'

# Start word for text extraction. The literal uses a typographic apostrophe
# (U+2019), so a PDF that spells it with a straight apostrophe will not match.
start_word = 'Acquis d’apprentissage'

# Collect one record per PDF and build the DataFrame in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.
records = []
for filename in os.listdir(folder_path):
    # Compare the extension case-insensitively so '.PDF' files are not skipped.
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        extracted_text = extract_text_from_pdf(file_path, start_word)
        records.append({'text': extracted_text, 'course': filename})

# Passing the columns explicitly keeps the 'text'/'course' layout even when the
# folder contains no PDF at all.
df = pd.DataFrame(records, columns=['text', 'course'])

# Print the DataFrame
print(df)


                                                 text  \
0   Acquis d’apprentissage  :  \nÀ la validation d...   
1   Module: Deep Learning  \nMachine Learning, Alg...   
2   Acquis d’apprentissage  :  \nà la validation d...   
3   Acquis d’apprentissage  :  \nà la validation d...   
4    \n1 \n \nUE : Traitement des données  \nDerni...   
5    \n1 \n \nUE :Projet Data Science    \nDernie ...   
6   Module : ARCHITECTURE DES SI I\nDernière mise ...   
7   Acquis d’apprentissage :  \nÀ la validation de...   
8    \n \nModule : Sécurité Informatique  \nDerniè...   
9   Acquis d’apprentissage :   \nà la validation d...   
10  Page 1 sur 8  \n \nArchitecture des SI II  spr...   
11  Acquis d’apprentissage :  à la validation de c...   
12   \n1 \n \n \nModule  : Machine Learning  \n  D...   
13   \n \nModule  : \nInnovation  et Entreprenaria...   
14  Module : Probabilités 1               A.U: 202...   
15   \n-  \n-  \n-  \n- Module : Services Web  \n-...   
16  UE : Communication, culture

In [4]:
# Show up to the first 20 rows of the text/course table built above.
df.head(20)

Unnamed: 0,text,course
0,Acquis d’apprentissage : \nÀ la validation d...,bigdata.pdf
1,"Module: Deep Learning \nMachine Learning, Alg...",deep.pdf
2,Acquis d’apprentissage : \nà la validation d...,Fiche module DBA 2022-2023.pdf
3,Acquis d’apprentissage : \nà la validation d...,Fiche module ST-DS.pdf
4,\n1 \n \nUE : Traitement des données \nDerni...,Fiche UE traitement de donnees.pdf
5,\n1 \n \nUE :Projet Data Science \nDernie ...,FicheUE_ProjetDS.pdf
6,Module : ARCHITECTURE DES SI I\nDernière mise ...,Fiche_module_ASI.pdf
7,Acquis d’apprentissage : \nÀ la validation de...,FM Prog-linéaire _AU_22__23.pdf
8,\n \nModule : Sécurité Informatique \nDerniè...,FM-Sécurité Informatique 22-23.pdf
9,Acquis d’apprentissage : \nà la validation d...,FM_Projet - RO_Complexite_2021_2022.pdf


### Here we will apply some text-cleaning techniques to clean the extracted text

In [6]:
import re

# Pattern applied under the name "html": it matches the literal 'X>' (optionally
# preceded by '<' characters) or '&' + lowercase letters/digits + ';'.
# NOTE(review): `<*?X>` does not match a generic tag and the numeric-entity
# alternatives carry no '#'; this looks like a garbled copy of the usual
# tag/entity pattern (`<.*?>|&(...|#...|#x...);`). Kept as-is — confirm intent.
html = re.compile(r"<*?X>|&([a-z0-9]+|[0-9]{1,6}|x[0-9a-f]{1,6});")

# Character class of symbols to delete: * / [ ] ( ) ? : ; . & % , X '
# NOTE(review): this also deletes every capital 'X' from the text (e.g. "XML"
# becomes "ML"), which can affect skill matching later — confirm intent.
special_chars = r"[*\/\[\]\(\)\?:;.&%,X'']"

# Apply the clean-up steps in their original order as one vectorised chain.
# The regex flag is spelled out on every step: Series.str.replace defaults to
# regex=False since pandas 2.0, so without regex=True the special_chars class
# would be searched for as a literal string and nothing would be removed.
df['text'] = (
    df['text']
    .str.replace('\t', '', regex=False)        # drop tab characters
    .str.replace('\uf02d', ' ', regex=False)   # U+F02D (private-use glyph) -> space
    .str.replace('\n', ' ', regex=False)       # newlines -> space
    .str.replace(']', '', regex=False)         # drop closing square brackets
    .str.replace(html, ' ', regex=True)        # "html" pattern matches -> space
    .str.replace(special_chars, '', regex=True)  # delete the listed symbols
)


In [8]:
# Display up to the first 20 rows again to inspect the text after cleaning.
df.head(20)

Unnamed: 0,text,course
0,Acquis d’apprentissage À la validation de ...,bigdata.pdf
1,Module Deep Learning Machine Learning Algori...,deep.pdf
2,Acquis d’apprentissage à la validation de ...,Fiche module DBA 2022-2023.pdf
3,Acquis d’apprentissage à la validation de ...,Fiche module ST-DS.pdf
4,1 UE Traitement des données Dernière m...,Fiche UE traitement de donnees.pdf
5,1 UE Projet Data Science Dernie re mi...,FicheUE_ProjetDS.pdf
6,Module ARCHITECTURE DES SI I Dernière mise à ...,Fiche_module_ASI.pdf
7,Acquis d’apprentissage À la validation de c...,FM Prog-linéaire _AU_22__23.pdf
8,Module Sécurité Informatique Dernière m...,FM-Sécurité Informatique 22-23.pdf
9,Acquis d’apprentissage à la validation de ...,FM_Projet - RO_Complexite_2021_2022.pdf


# Using SkillNer to extract skills from the text extracted from every PDF file in a specific folder

In [9]:
# Run the skill extractor on every course text and collect one result table per
# course.
data_frames = []
# Iterate over the values themselves. Looking rows up with df['text'][i] for
# i in range(len(df)) is a label lookup, so it only works while df still has
# the default 0..n-1 index and breaks as soon as rows are filtered or re-indexed.
for text, course in zip(df['text'], df['course']):
    annotations = skill_extractor.annotate(text)
    # NOTE(review): describe() is used here as if it returned a DataFrame (a
    # column is assigned to it and it is passed to pd.concat) — confirm against
    # the skillNer copy this kernel imports.
    data = skill_extractor.describe(annotations)
    # Tag every extracted skill with the PDF file it came from.
    data['course'] = course
    data_frames.append(data)
# Stack the per-course tables; each one keeps its own row index, so index values
# repeat across courses in the result.
final_data = pd.concat(data_frames)


# Final Output

In [10]:
# Display up to the first 100 rows of the combined skills table.
final_data.head(100)

Unnamed: 0,raw_text,skill_type,skill_name,course
0,big datum,Hard Skill,Big Data,bigdata.pdf
1,ses,Hard Skill,Amazon Simple Email Service (SES),bigdata.pdf
2,mapreduce,Hard Skill,MapReduce,bigdata.pdf
3,en,Hard Skill,EN 1993 Building Codes,bigdata.pdf
4,spark streaming,Hard Skill,Spark Streaming,bigdata.pdf
...,...,...,...,...
0,vues,Hard Skill,Vue.js,Fiche module DBA 2022-2023.pdf
1,vues,Hard Skill,Vue.js,Fiche module DBA 2022-2023.pdf
2,vues,Hard Skill,Vue.js,Fiche module DBA 2022-2023.pdf
3,vues,Hard Skill,Vue.js,Fiche module DBA 2022-2023.pdf


In [41]:
# Keep a single row per (skill_name, course) pair; final_data is modified in
# place, so re-displaying earlier outputs will no longer match this object.
final_data.drop_duplicates(subset=['skill_name', 'course'], inplace=True)

In [42]:
# Write the skills table to an Excel workbook; the path is relative, so the file
# is created in the kernel's current working directory. The row index is
# written as the first column (pandas default).
final_data.to_excel('data_fiche_ds_2.xlsx')