In [1]:
import os
# When the notebook is started from inside the `Code` directory, move up to its
# parent (presumably the project root that the relative 'Matrices/...' paths
# expect — confirm). Only the last path component is tested, so a '/Code'
# substring elsewhere in the path is never altered, and re-running the cell
# after the move is a no-op.
if os.path.basename(os.getcwd()) == 'Code':
    os.chdir(os.path.dirname(os.getcwd()))

In [2]:
from pathlib import Path
import glob
import pandas as pd
import gzip
import pickle as pkl
import re
from collections import Counter
import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import spacy

In [3]:
import unicodedata
def remove_accent(word):
    """Return ``word`` with diacritics stripped and non-ASCII characters removed.

    The string is decomposed (NFD) so that accented letters become a base
    letter followed by combining marks; encoding to ASCII with ``'ignore'``
    then discards the marks, along with any character that has no ASCII
    form after decomposition.
    """
    decomposed = unicodedata.normalize('NFD', word)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode("utf-8")

In [4]:
# Load the spaCy pipeline 'en_core_web_md'. `nlp` is a notebook-level global that
# etape_map calls on each CV text.
# NOTE(review): etape_map only reads lemma/stop/punct/space/digit token attributes;
# disabling unused pipeline components might speed it up — confirm nothing else needs them.
nlp = spacy.load('en_core_web_md')

## Récupération de la matrice précédente avec les CV en anglais

In [5]:
# Load the gzip-compressed pickle holding the translated CVs and discard the
# 'text_no_x' column straight away.
# NOTE: unpickling can execute arbitrary code — only open files from a trusted source.
with gzip.open('Matrices/CVs2_traductionALL_srcfr.pkl.gz', 'rb') as f:
    CVs_df = pkl.load(f).drop('text_no_x', axis=1)

print(CVs_df.shape)

(8212, 3)


In [6]:
# Display the first five rows of the loaded DataFrame.
CVs_df.head(5)

Unnamed: 0_level_0,id_CV,len_cv_initial,traduction
id_CV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5711,5711,4477,Jaouadi - Junior Data Scientist\n\n\n Jaouadi...
669,669,6699,Breton\n\n\n \n75\n\n . . . . .\n Consult...
938,938,5769,94. : +33 (0) 1 45 18 33: 33 (....\n \n\n \...
1835,1835,2353,M\n\nMr. Nationality: French\nDate of birth: 1...
266,266,4407,de'Reguardati - Financial Engineer\n\n\n de'R...


**Si la colonne "traduction"==None alors le cv n'a pas été traduit**

In [7]:
# A missing value in the "traduction" column marks a CV that was not translated.
# The test is done column-wise with isna() rather than one label lookup per row:
# it does not depend on index labels being unique, and it also flags NaN values,
# which the regex clean-up applied afterwards could not process anyway.
cv_non_traduit = CVs_df["traduction"].isna()
id_cv_non_traduit = CVs_df.index[cv_non_traduit].tolist()
print(len(id_cv_non_traduit) , "cv n'ont pas été traduit")

# Remove the untranslated CVs from the working DataFrame.
CVs_df = CVs_df.drop(id_cv_non_traduit)
print(CVs_df.shape)

7 cv n'ont pas été traduit
(8205, 3)


### Suppression de certains caractères qui posent problème avec Spacy

In [8]:
# Replace each of the characters ( ) / > < in the translated text with a space.
# NOTE(review): ')' appears twice in the character class; this is harmless, but
# confirm that no other character was intended.
CVs_df['traduction'] = CVs_df["traduction"].map(lambda texte: re.sub("[())/><]", ' ', texte))

### On définit la liste de listes [[text, id], [text, id], ...] pour le multiprocessing

In [9]:
# Build the [[text, id], ...] work list consumed by the multiprocessing map step.
# The two columns are zipped positionally instead of doing two label-based row
# lookups per CV, which is much cheaper and does not rely on unique index labels.
liste_CV_id = [
    [texte, id_cv]
    for texte, id_cv in zip(CVs_df['traduction'].tolist(), CVs_df['id_CV'].tolist())
]
print(len(liste_CV_id))

8205


# On utilise spaCy pour tokeniser, lemmatiser, enlever les stop words et la ponctuation

## multiprocess

In [10]:
def etape_map(CV_and_id):
    """Tokenise and normalise one CV (the "map" step).

    Parameters
    ----------
    CV_and_id : sequence
        ``[text, id]`` pair: element 0 is the CV text handed to the global
        ``nlp`` pipeline, element 1 is the CV identifier.

    Returns
    -------
    tuple
        ``([id_CV], [tokens])`` — two single-element lists, the shape that
        ``etape_reduce`` folds together. ``tokens`` is the list of cleaned
        words of this CV.
    """
    CV = CV_and_id[0]
    id_CV = CV_and_id[1]

    # Lower-cased, accent-free lemmas; punctuation, whitespace, stop words and
    # digit-only tokens are dropped.
    list_word_cv = [
        remove_accent(w.lemma_.lower())
        for w in nlp(CV)
        if not w.is_punct and not w.is_space and not w.is_stop and not w.is_digit
    ]
    CV_join1 = " ".join(list_word_cv)

    # Split the CV into words while keeping "c++" and "c#" intact.
    # Raw string: "\+" is an invalid escape sequence in a normal string literal.
    CV_list = re.findall(r"c\+\+|c#|[0-9a-z]+", CV_join1)

    # Drop purely numeric words (digits inside words such as "j2ee" are kept).
    CV_nodecimal = [mot for mot in CV_list if not mot.isdecimal()]

    # Drop one-character words, except "r" and "c".
    CV_len = [mot for mot in CV_nodecimal if mot in ("r", "c") or len(mot) >= 2]

    return ([id_CV], [CV_len])

def etape_reduce(a, b):
    """Fold one mapped result into the accumulator (the "reduce" step).

    ``a`` and ``b`` are pairs of lists. The first element of ``b[0]`` is
    appended to ``a[0]`` and the first element of ``b[1]`` to ``a[1]``.
    ``a`` is modified in place and returned; ``b`` is left untouched.
    """
    for position in (0, 1):
        a[position].append(b[position][0])
    return a

In [11]:
%%time
from multiprocessing import Pool
from functools import reduce

# Map step: run etape_map over the [text, id] pairs in a pool of worker processes.
with Pool() as pool:
    m = pool.map(etape_map, liste_CV_id)

# Reduce step: fold the per-CV results into one ([ids], [token lists]) pair.
# etape_reduce appends to its first argument in place, so an explicit fresh
# accumulator is supplied: without it m[0] would be mutated (re-running the
# reduce would duplicate entries) and an empty `m` would raise TypeError.
r = reduce(etape_reduce, m, ([], []))

CPU times: user 1.66 s, sys: 544 ms, total: 2.2 s
Wall time: 4min 53s


In [12]:
# `r` holds two parallel lists (ids, token lists): load them as two labelled rows,
# then transpose so that each CV becomes one row with columns id_CV / cv_list.
rdf = pd.DataFrame(r, index=['id_CV', "cv_list"]).T
# Index the rows by id_CV while keeping id_CV as a regular column too.
rdf = rdf.set_index('id_CV', drop=False)
print(rdf.shape)
rdf.head(1)

(8205, 2)


Unnamed: 0_level_0,id_CV,cv_list
id_CV,Unnamed: 1_level_1,Unnamed: 2_level_1
5711,5711,"[jaouadi, junior, data, scientist, jaouadi, ju..."


In [13]:
# Attach the columns of CVs_df to the token lists by joining on the index.
# id_CV exists in both frames; the copy coming from CVs_df is removed before the
# join so only the one from rdf remains.
CVs_df_liste = rdf.join(CVs_df.drop('id_CV', axis=1))
print(CVs_df_liste.shape)
CVs_df_liste.head(1)

(8205, 4)


Unnamed: 0_level_0,id_CV,cv_list,len_cv_initial,traduction
id_CV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5711,5711,"[jaouadi, junior, data, scientist, jaouadi, ju...",4477,Jaouadi - Junior Data Scientist\n\n\n Jaouadi...


## Matrice Count

In [14]:
from sklearn.feature_extraction import DictVectorizer
# One {token: occurrences} dict per CV, vectorised into a CV x vocabulary matrix.
count = [dict(Counter(cv)) for cv in r[1]]
dictvectorizer = DictVectorizer()
dictvectorizerFIT = dictvectorizer.fit_transform(count)

# NOTE: toarray() materialises the whole matrix densely (one cell per CV/token
# pair), which is memory-hungry when the vocabulary is large.
matrice_count = dictvectorizerFIT.toarray()

# feature_names_ is the fitted vocabulary in column order. It is the list that
# get_feature_names() returned, but that method was removed in scikit-learn 1.2,
# whereas the attribute works on both old and current releases.
DataFrame_Count = pd.DataFrame(matrice_count, columns=dictvectorizer.feature_names_, index=r[0])
print("Le vocabulaire de base contient", DataFrame_Count.shape[1], "tokens.")

Le vocabulaire de base contient 91920 tokens.


In [15]:
# Save CVs_df_liste (token lists joined with the CV columns) as a
# gzip-compressed pickle.
with gzip.open('Matrices/CVs3_tokenization.pkl.gz', 'wb') as f:
    pkl.dump(CVs_df_liste, f)