In [1]:
import pandas as pd   # For data manipulation
import nltk           # For natural language processing
import re             # For regular expressions (text cleaning)
nltk.download('stopwords')  # Download common stopwords
nltk.download('punkt')      # Download tokenizer models
nltk.download('wordnet')    # Download WordNet lemmatizer data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\prana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\prana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv(r'C:\Users\prana\OneDrive\Desktop\infosys\job_title_des.csv')
print(df.head()) 


   Unnamed: 0             Job Title  \
0           0     Flutter Developer   
1           1      Django Developer   
2           2      Machine Learning   
3           3         iOS Developer   
4           4  Full Stack Developer   

                                     Job Description  
0  We are looking for hire experts flutter develo...  
1  PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...  
2  Data Scientist (Contractor)\n\nBangalore, IN\n...  
3  JOB DESCRIPTION:\n\nStrong framework outside o...  
4  job responsibility full stack engineer – react...  


In [3]:
df["text_lower_JD"] = df["Job Description"].str.lower()
df.head()


Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,text_lower_JD
0,0,Flutter Developer,We are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,python/django (developer/lead) - job code(pdj ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n...","data scientist (contractor)\n\nbangalore, in\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...,job description:\n\nstrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...


In [4]:
import string

# Define which punctuation to remove (all standard punctuation)
PUNCT_TO_REMOVE = string.punctuation

# Custom function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

# Apply the function to the column 'text_lower_JD' and create a new column
df["text_wo_punct"] = df["text_lower_JD"].apply(lambda text: remove_punctuation(str(text)))

# View first rows to verify
print(df[["text_lower_JD", "text_wo_punct"]].head())

                                       text_lower_JD  \
0  we are looking for hire experts flutter develo...   
1  python/django (developer/lead) - job code(pdj ...   
2  data scientist (contractor)\n\nbangalore, in\n...   
3  job description:\n\nstrong framework outside o...   
4  job responsibility full stack engineer – react...   

                                       text_wo_punct  
0  we are looking for hire experts flutter develo...  
1  pythondjango developerlead  job codepdj  04\ns...  
2  data scientist contractor\n\nbangalore in\n\nr...  
3  job description\n\nstrong framework outside of...  
4  job responsibility full stack engineer – react...  


In [5]:
from nltk.corpus import stopwords

# Download stopwords if not already done
import nltk
nltk.download('stopwords')

# Create a set of English stopwords
STOPWORDS = set(stopwords.words('english'))

# Define a function to remove stopwords from a text string
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# Apply the function to the 'text_wo_punct' column and create a new column 'text_wo_stop'
df["text_wo_stop"] = df["text_wo_punct"].apply(lambda text: remove_stopwords(text))

# Display first few rows to verify
print(df[["text_wo_punct", "text_wo_stop"]].head())

                                       text_wo_punct  \
0  we are looking for hire experts flutter develo...   
1  pythondjango developerlead  job codepdj  04\ns...   
2  data scientist contractor\n\nbangalore in\n\nr...   
3  job description\n\nstrong framework outside of...   
4  job responsibility full stack engineer – react...   

                                        text_wo_stop  
0  looking hire experts flutter developer eligibl...  
1  pythondjango developerlead job codepdj 04 stro...  
2  data scientist contractor bangalore responsibi...  
3  job description strong framework outside ios a...  
4  job responsibility full stack engineer – react...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from collections import Counter

# Count word frequencies in the 'text_wo_stop' column
cnt = Counter()
for text in df["text_wo_stop"].values:
    for word in text.split():
        cnt[word] += 1
        
# Get the top 10 most common words
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
print("Most frequent words:", FREQWORDS)

# Function to remove the most frequent words from text
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

# Apply the function to remove these frequent words and create a new column
df["text_wo_stopfreq"] = df["text_wo_stop"].apply(lambda text: remove_freqwords(text))

# Display to verify the result
print(df[["text_wo_stop", "text_wo_stopfreq"]].head())



Most frequent words: {'years', 'experience', 'work', 'data', 'design', 'development', 'skills', 'knowledge', 'team', 'software'}
                                        text_wo_stop  \
0  looking hire experts flutter developer eligibl...   
1  pythondjango developerlead job codepdj 04 stro...   
2  data scientist contractor bangalore responsibi...   
3  job description strong framework outside ios a...   
4  job responsibility full stack engineer – react...   

                                    text_wo_stopfreq  
0  looking hire experts flutter developer eligibl...  
1  pythondjango developerlead job codepdj 04 stro...  
2  scientist contractor bangalore responsibilitie...  
3  job description strong framework outside ios a...  
4  job responsibility full stack engineer – react...  


In [7]:
from collections import Counter
import random

# Count frequencies again on your chosen column (e.g., 'text_wo_stop')
cnt = Counter()
for text in df["text_wo_stop"].values:
    for word in text.split():
        cnt[word] += 1

# Get rare words occurring fewer than a threshold (e.g. 3 times)
threshold = 3
rare_words = [w for w, c in cnt.items() if c < threshold]

# Randomly sample some rare words to inspect (set sample size)
sample_size = 30
sampled_rare_words = random.sample(rare_words, min(sample_size, len(rare_words)))
print("Sampled rare words for inspection:")
print(sampled_rare_words)


Sampled rare words for inspection:
['rolethis', 'orms', 'luke', 'locationtravel', '88', 'railsstrategyreportingautomation', 'gmlan', '·wordpress', 'githubbitbucket', 'techno', 'lastline', 'blockchains', 'suse', 'benefiteligible', 'hyperdimensions', 'segregation', 'createsoversees', 'oraclebased', 'clientproduct', 'streamed', 'orchestrators', 'savior', 'strapped', 'wwwconnellypartnerscom', 'projectsfeatures', 'fenestration', 'hinder', 'youâ€™ll', 'highlyscalableperformant', 'identificationfixing']


In [8]:
whitelist = {"flutter", "django", "react", "python", "contractor", "engineer", "kubernetes", "aws", "hadoop","akamai", "galera", "unicloud™", "unirsm™", "ahmadabad"}


In [9]:
RAREWORDS = set(rare_words)

def remove_rarewords_with_whitelist(text):
    return " ".join([word for word in str(text).split() if (word not in RAREWORDS or word in whitelist)])

df["text_wo_stoprare"] = df["text_wo_stop"].apply(remove_rarewords_with_whitelist)

# Verify results
print(df[["text_wo_stop", "text_wo_stoprare"]].head())

                                        text_wo_stop  \
0  looking hire experts flutter developer eligibl...   
1  pythondjango developerlead job codepdj 04 stro...   
2  data scientist contractor bangalore responsibi...   
3  job description strong framework outside ios a...   
4  job responsibility full stack engineer – react...   

                                    text_wo_stoprare  
0  looking hire experts flutter developer eligibl...  
1  pythondjango job 04 strong python experience a...  
2  data scientist contractor bangalore responsibi...  
3  job description strong framework outside ios a...  
4  job responsibility full stack engineer – react...  


In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to C:\Users\prana/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [12]:
# Define whitelist of important words to keep
whitelist = {"flutter", "django", "react", "python", "contractor", "engineer", "kubernetes", "aws", "hadoop"}

# Assume RAREWORDS is already defined as a set of rare words
def remove_rarewords_with_whitelist(text):
    return " ".join([word for word in str(text).split() if (word not in RAREWORDS or word in whitelist)])

# Apply the function
df["text_wo_stoprare"] = df["text_wo_stop"].apply(remove_rarewords_with_whitelist)

# Verify results
print(df[["text_wo_stop", "text_wo_stoprare"]].head())


                                        text_wo_stop  \
0  looking hire experts flutter developer eligibl...   
1  pythondjango developerlead job codepdj 04 stro...   
2  data scientist contractor bangalore responsibi...   
3  job description strong framework outside ios a...   
4  job responsibility full stack engineer – react...   

                                    text_wo_stoprare  
0  looking hire experts flutter developer eligibl...  
1  pythondjango job 04 strong python experience a...  
2  data scientist contractor bangalore responsibi...  
3  job description strong framework outside ios a...  
4  job responsibility full stack engineer – react...  


In [13]:
print(df.columns)


Index(['Unnamed: 0', 'Job Title', 'Job Description', 'text_lower_JD',
       'text_wo_punct', 'text_wo_stop', 'text_wo_stopfreq',
       'text_wo_stoprare'],
      dtype='object')


In [14]:
import nltk
nltk.download('averaged_perceptron_tagger')  # Correct POS tagger resource
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\prana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\prana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in str(text).split()])

df["text_lemmatized"] = df["text_wo_stoprare"].apply(lemmatize_words)
print(df[["text_wo_stoprare", "text_lemmatized"]].head())


                                    text_wo_stoprare  \
0  looking hire experts flutter developer eligibl...   
1  pythondjango job 04 strong python experience a...   
2  data scientist contractor bangalore responsibi...   
3  job description strong framework outside ios a...   
4  job responsibility full stack engineer – react...   

                                     text_lemmatized  
0  looking hire expert flutter developer eligible...  
1  pythondjango job 04 strong python experience a...  
2  data scientist contractor bangalore responsibi...  
3  job description strong framework outside io al...  
4  job responsibility full stack engineer – react...  


In [16]:
import nltk
print(nltk.data.path)


['C:\\Users\\prana/nltk_data', 'C:\\Users\\prana\\anaconda3\\nltk_data', 'C:\\Users\\prana\\anaconda3\\share\\nltk_data', 'C:\\Users\\prana\\anaconda3\\lib\\nltk_data', 'C:\\Users\\prana\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [19]:
# First, download the required NLTK resources
import nltk
nltk.download('averaged_perceptron_tagger_eng')  # Correct resource name (without _eng)
nltk.download('wordnet')  # Also download wordnet if not already downloaded

# Now the original code should work
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())  # This will now work with the downloaded resource
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized_2"] = df["text_wo_stoprare"].apply(lambda text: lemmatize_words(text))
df.head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\prana/nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to C:\Users\prana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,text_lower_JD,text_wo_punct,text_wo_stop,text_wo_stopfreq,text_wo_stoprare,text_lemmatized,text_lemmatized_2
0,0,Flutter Developer,We are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...,looking hire experts flutter developer eligibl...,looking hire experts flutter developer eligibl...,looking hire experts flutter developer eligibl...,looking hire expert flutter developer eligible...,look hire expert flutter developer eligible po...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,python/django (developer/lead) - job code(pdj ...,pythondjango developerlead job codepdj 04\ns...,pythondjango developerlead job codepdj 04 stro...,pythondjango developerlead job codepdj 04 stro...,pythondjango job 04 strong python experience a...,pythondjango job 04 strong python experience a...,pythondjango job 04 strong python experience a...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n...","data scientist (contractor)\n\nbangalore, in\n...",data scientist contractor\n\nbangalore in\n\nr...,data scientist contractor bangalore responsibi...,scientist contractor bangalore responsibilitie...,data scientist contractor bangalore responsibi...,data scientist contractor bangalore responsibi...,data scientist contractor bangalore responsibi...
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...,job description:\n\nstrong framework outside o...,job description\n\nstrong framework outside of...,job description strong framework outside ios a...,job description strong framework outside ios a...,job description strong framework outside ios a...,job description strong framework outside io al...,job description strong framework outside io al...
4,4,Full Stack Developer,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...,job responsibility full stack engineer – react...


In [20]:
df.to_csv('your_dataset_lemmatized.csv', index=False)


In [21]:
from unidecode import unidecode

df['text_lemmatized_clean'] = df['text_lemmatized'].apply(lambda x: unidecode(str(x)))


In [22]:
df.to_csv('your_dataset_lemmatized.csv', index=False)


In [23]:
import re

def clean_text(text):
    text = str(text)
    # Remove special characters except apostrophes or keep only letters and spaces
    text = re.sub(r"[^a-zA-Z0-9\s']", '', text)  
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)  
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

df['text_cleaned'] = df['text_lemmatized'].apply(clean_text)

print(df[['text_lemmatized', 'text_cleaned']].head())


                                     text_lemmatized  \
0  looking hire expert flutter developer eligible...   
1  pythondjango job 04 strong python experience a...   
2  data scientist contractor bangalore responsibi...   
3  job description strong framework outside io al...   
4  job responsibility full stack engineer – react...   

                                        text_cleaned  
0  looking hire expert flutter developer eligible...  
1  pythondjango job 04 strong python experience a...  
2  data scientist contractor bangalore responsibi...  
3  job description strong framework outside io al...  
4  job responsibility full stack engineer react r...  


In [24]:
df.to_csv('your_dataset_lemmatized.csv', index=False)