<center> <H3> Functions </H3> </center>

In [None]:
#Required Libraries

import os
import re
import tempfile
from collections import Counter

import docx
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pyresparser
from nltk import ngrams
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS


nltk.download('stopwords')
nltk.download('punkt')

#### Function to join all the datasets

In [None]:
# Function to concatenate all resume CSV files into one single CSV file

def resume_datasets(dir_path, name):
    """Concatenate every CSV file found in ``dir_path`` into ``<name>.csv``.

    Args:
        dir_path: Directory that is scanned (non-recursively) for files whose
            name ends with ``.csv``.
        name: Output path without extension; the combined data is written to
            ``name + ".csv"``.

    Returns:
        None. The result is written to disk and a status line is printed.
    """
    output_path = os.path.abspath(name + ".csv")

    # os.listdir() returns entries in arbitrary order, so sort the names to make
    # the row order of the combined file reproducible. Skip the output file
    # itself: if it lives in dir_path, a re-run would otherwise read the
    # previous result back in and duplicate every row.
    csv_paths = [
        os.path.join(dir_path, filename)
        for filename in sorted(os.listdir(dir_path))
        if filename.endswith(".csv")
        and os.path.abspath(os.path.join(dir_path, filename)) != output_path
    ]

    if not csv_paths:
        print("No CSV files found in directory.")
        return

    # Read each file once, then concatenate in a single call.
    df_list = [pd.read_csv(csv_path) for csv_path in csv_paths]
    result = pd.concat(df_list, ignore_index=True)
    result.to_csv(name + ".csv", index=False)
    print(f"Concatenated {len(df_list)} CSV files to {name}.csv")

#### Function to clean the data

In [None]:
## Function to clean the resume using regular expression

def clean_resume(text):
    """Normalise raw resume text for downstream text processing.

    Steps, in order: coerce to ``str``, strip URLs, replace ASCII punctuation
    with spaces, collapse whitespace, convert the number words one..ten to
    digits, lower-case, tokenize with NLTK and drop English stop words.

    Args:
        text: Resume content; any object is accepted and passed through ``str``.

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    # Spelled-out numbers are mapped to digits so that, for example,
    # "one year of experience" and "1 year of experience" clean to the same text.
    number_words = {
        'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
        'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
    }

    text = str(text)
    stop_words = set(stopwords.words('english'))

    # Raw strings keep the regex escapes from being read as (invalid) Python
    # string escapes.
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)

    # replacing number words with digits
    # Eg: I have one year of experience -> I have 1 year of experience
    text = re.sub(
        r'\b(?:%s)\b' % '|'.join(number_words),
        lambda match: number_words[match.group(0).lower()],
        text,
        flags=re.IGNORECASE,
    )

    # removing stop words
    tokens = nltk.word_tokenize(text.lower())  # tokenize and convert to lower case
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

In [None]:
# function to clean text from skills, degrees, designation

def clean_text(skills):
    """Flatten a skills/degree/designation value into space-separated tokens.

    The value is converted with ``str``; square brackets, single quotes and
    spaces are removed, commas become single spaces, and the result is
    lower-cased. Multi-word entries therefore collapse into one token
    (``'Machine Learning'`` -> ``'machinelearning'``).
    """
    # One translation table does all the work: ',' -> ' ' and the characters
    # in the third argument are deleted.
    table = str.maketrans(",", " ", "[]' ")
    return str(skills).translate(table).lower()

#### Function to extract information from resume

In [None]:
#using pyresparser library to extract features from resume

def extract_info(text):
    """Extract name, email and skills from resume text using pyresparser.

    The text is written to a Word document first and the path of that
    document is handed to ``pyresparser.ResumeParser``.

    Args:
        text: Resume content to place in the document's single paragraph.

    Returns:
        A ``(name, email, skills)`` tuple taken from the parser's result dict.
    """
    # A private temporary directory gives every call its own file, so calls
    # cannot overwrite each other's document, and it is removed on exit
    # instead of leaving a stray temp.docx in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        docx_path = os.path.join(tmp_dir, "temp.docx")

        # creating a new Word document
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(docx_path)

        # using PyResparser to extract information from the resume
        extracted_info = pyresparser.ResumeParser(docx_path).get_extracted_data()

    # extracting name, email and skills from the extracted information
    name = extracted_info['name']
    email = extracted_info['email']
    skills = extracted_info['skills']

    return name, email, skills

#### Function to show wordcloud

In [None]:
# generating a wordcloud on resume skills
# the wordcloud is built from n-grams (e.g. n=2 for bigrams)

def generate_wordcloud(text, n):
    """Display a word cloud of the n-gram frequencies in ``text``.

    Args:
        text: Whitespace-separated text; it is split with ``str.split()``.
        n: Size of the n-grams (1 for single words, 2 for bigrams, ...).

    Raises:
        ValueError: If ``text`` yields no n-grams (for example, it contains
            fewer than ``n`` words), since there is nothing to plot.
    """
    # generating n-grams and counting how often each one occurs
    n_grams = ngrams(text.split(), n)
    freq_dict = Counter(' '.join(n_gram) for n_gram in n_grams)

    # Fail early with an actionable message instead of letting the word cloud
    # library reject an empty frequency table.
    if not freq_dict:
        raise ValueError(
            f"Cannot generate a word cloud: text contains no {n}-grams."
        )

    # creating wordcloud
    # NOTE: per the wordcloud documentation, `stopwords` is ignored when the
    # cloud is built with generate_from_frequencies(); stop words must be
    # removed from `text` beforehand if they should not appear.
    wordcloud = WordCloud(width=800, height=400, background_color='white', min_font_size=10,
                          stopwords=STOPWORDS)
    wordcloud.generate_from_frequencies(freq_dict)

    # displaying wordcloud
    fig, ax = plt.subplots(figsize=(12, 10))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    fig.tight_layout(pad=0)
    plt.show()
