In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('New_Engineer.csv')
df.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,224907.0,Guidance Navigation and Control (GN&C) Enginee...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,331804.0,"Propulsion Engineer - Associate, Mid-Level and...",ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,336462.0,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,338951.0,RF/Microwave Engineer (Level 2 or 3),ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."


#### First task is to clean the data, so let's first get rid of any punctuation and numbers in the OrigJobTitle column

In [4]:
import re
import string

Making a function to get rid of numbers, punctuation, and any letters adjacent to numbers for the OrigJobTitle feature column

In [5]:
# Function to clean job titles
def clean_job_title_final(title):
    # Remove numbers and adjacent characters
    cleaned_title = re.sub(r'[$0-9]+[\w]*', '', title)
    
    # Prepare a regex pattern that includes all punctuation marks except for whitespace
    punct_pattern = r"[{}]".format(re.escape(string.punctuation.replace(" ", "")))
    
    # Remove other punctuation marks except whitespace
    cleaned_title = re.sub(punct_pattern, '', cleaned_title)
    
    # Remove any leading or trailing spaces
    cleaned_title = cleaned_title.strip(' ')
    return cleaned_title


# Apply the final cleaning function to the 'OrigJobTitle' column
df['OrigJobTitle'] = df['OrigJobTitle'].apply(clean_job_title_final)
# Rename it for new CSV naming purposes
df.rename(columns={'OrigJobTitle': 'CleanedJobTitle'}, inplace=True)

df


Unnamed: 0,RequisitionID,CleanedJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,00000224907,Guidance Navigation and Control GNC Engineer ...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,00000331804,Propulsion Engineer Associate MidLevel and Ex...,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,00000336462,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,00000338951,RFMicrowave Engineer Level or,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
...,...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,ENGINEER (all other),Summary Explore a new career with the BLM - wh...
20133,Y7193L,STATIONARY ENGINEER HELPER,ENGINEER (all other),STATIONARY ENGINEER HELPER\n\n Print (http://...
20134,Y7198M,STATIONARY ENGINEER II,ENGINEER (all other),STATIONARY ENGINEER II \n\n Print (http://a...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,ENGINEER (all other),STATIONARY ENGINEER CONTROLS SPECIALIST\n\n Pr...


#### **NLTK** is a leading platform for building Python programs to work with human language data. It provides a pre-processed stop word lists for various languages (https://www.geeksforgeeks.org/removing-stop-words-nltk-python/) - Below I will make a function that will help us get rid of stop words from any column, but specifically the now cleaned job title feature and eventually the job description feature

In [6]:
import nltk

In [7]:
from nltk.corpus import stopwords

In [8]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()  # split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # remove stop words
    return " ".join(filtered_words)  # join the words back into a string

df['CleanedJobTitle'] = df['CleanedJobTitle'].apply(remove_stopwords)

df

Unnamed: 0,RequisitionID,CleanedJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,00000336462,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,00000338951,RFMicrowave Engineer Level,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
...,...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,ENGINEER (all other),Summary Explore a new career with the BLM - wh...
20133,Y7193L,STATIONARY ENGINEER HELPER,ENGINEER (all other),STATIONARY ENGINEER HELPER\n\n Print (http://...
20134,Y7198M,STATIONARY ENGINEER II,ENGINEER (all other),STATIONARY ENGINEER II \n\n Print (http://a...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,ENGINEER (all other),STATIONARY ENGINEER CONTROLS SPECIALIST\n\n Pr...


#### Going to check to see if JobTitle column is all the same value. If it is, I will drop the feature as it won't be predictive of anything we want to predict

In [9]:
df['JobTitle'].unique()

array(['ENGINEER (all other)'], dtype=object)

In [10]:
df.drop('JobTitle', axis = 1, inplace = True)
df

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer \n\n Froedtert So...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,**The Boeing Company** is in search of a **L...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,"**Job Description**\n\nAt Boeing, we innovate ..."
3,00000336462,Senior Process Controls Engineer,"**Job Description**\n\nAt Boeing, we innovate ..."
4,00000338951,RFMicrowave Engineer Level,"**Job Description**\n\nAt Boeing, we innovate ..."
...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,Summary Explore a new career with the BLM - wh...
20133,Y7193L,STATIONARY ENGINEER HELPER,STATIONARY ENGINEER HELPER\n\n Print (http://...
20134,Y7198M,STATIONARY ENGINEER II,STATIONARY ENGINEER II \n\n Print (http://a...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,STATIONARY ENGINEER CONTROLS SPECIALIST\n\n Pr...


## It is now time to focus on cleaining the Job Description feature, and as first suggested, I will remove the html tags and url, along with deal with punctuation, extra spaces, and stop words

### <u>Removing HTML tags in Pandas</u>
https://stackoverflow.com/questions/45999415/removing-html-tags-in-pandas

In [11]:
df['JobDescription'] = df['JobDescription'].str.replace(r'<[^<>]*>', '', regex=True)
df

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer \n\n Froedtert So...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,**The Boeing Company** is in search of a **L...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,"**Job Description**\n\nAt Boeing, we innovate ..."
3,00000336462,Senior Process Controls Engineer,"**Job Description**\n\nAt Boeing, we innovate ..."
4,00000338951,RFMicrowave Engineer Level,"**Job Description**\n\nAt Boeing, we innovate ..."
...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,Summary Explore a new career with the BLM - wh...
20133,Y7193L,STATIONARY ENGINEER HELPER,STATIONARY ENGINEER HELPER\n\n Print (http://...
20134,Y7198M,STATIONARY ENGINEER II,STATIONARY ENGINEER II \n\n Print (http://a...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,STATIONARY ENGINEER CONTROLS SPECIALIST\n\n Pr...


#### Confirming it works with 5 random job descriptions, which I will check with the original dataframe

In [12]:
# Use `sample` to get 5 random rows 
random_rows = df.sample(n=5)

# Increase column width to display the full content
pd.set_option('display.max_colwidth', None)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['JobDescription']}\n---")

Index: 10432
Content: **JOB TITLE: SR. Structural Engineer**\n\n**NUMBER OF POSITIONS: 5**\n\n**WORK ADDRESS: Richland, WA 99354**\n\n**TELEWORK/REMOTE/ONSITE: Full Time Office / Project - No Telework offered.**\n\n**JOB DESCRIPTION:**\n\nOriginate and check complex structural engineering deliverables, such as steel and concrete calculations, specifications, material requisitions.\n\nReviews and coordinates technical documents with/from other engineering disciplines, procurement, and construction.\n\nPrepare technical reports related to specific design issues.\n\nManage specifically assigned project activities.\n\nAssist in constructability reviews.\n\nSupport construction by dispositioning field change requests and nonconformance reports.\n\nAct as a mentor or coach for subordinate engineers and designers.\n\nSchedule and coordinate off-project review requirements for engineering deliverables.\n\nLead and/or participate in technical discussions with client, suppliers, construction, et

#### Removing stop words next

In [13]:
df['JobDescription'] = df['JobDescription'].apply(remove_stopwords)

In [14]:
# Use `sample` to get 5 random rows 
random_rows = df.sample(n=5)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['JobDescription']}\n---")

Index: 15625
Content: **Job Description**\n\nAt BrightDrop, reshaping e-commerce developing smarter, greener, efficient ways deliver goods services door, delivering brighter future cities live. building ecosystem all-electric, zero-emissions delivery solutions - electric vehicles, ePallets software leverages real-time data drive intelligent optimizations e-commerce. deliver mission growing fast building team, based Palo Alto, offices Atlanta Detroit, customer-focused, agile passionate innovating sustainable future.\n\nFrom engineering product management operations, BrightDrop looking people combine passion technology sustainability high doses curiosity rigorous thinking deliver better future.\n\nBacked General Motors, BrightDrop striving improve communities live deliver better future generations come. hope join us.\n\nIn Senior Automation QE role, get work automated frameworks web, backend mobile platforms GM's new enterprise electrified fleet application management. quality-first team

In [16]:
pd.reset_option('display.max_colwidth')

#### Getting rid of certain punctuation, trying to limit the amount of extra whitespace, but allowing whitespace between certain removed punctuation marks, like \n, commas and colons

In [17]:
def clean_job_description_no_punctuation(title):
    # Replace newlines (\n) with space
    cleaned_title = re.sub(r'\\n', ' ', title)
    
    # Replace 'and/or' with space
    cleaned_title = re.sub(r'and/or', ' ', cleaned_title)
    
    # Replace characters like '+', '*' with space
    cleaned_title = re.sub(r'[+*]', '', cleaned_title)

    # Replace commas and colons with space
    cleaned_title = re.sub(r'[:,]', ' ', cleaned_title)
    
    # Keep only alphanumeric characters, whitespaces, and sentence-ending punctuation (.!?).
    sentence_end_punct = '.!?'
    allowed_characters = f"{string.ascii_letters}{string.digits}{string.whitespace}{sentence_end_punct}"
    pattern = f"[^{allowed_characters}]"

    # Replace other unwanted characters with nothing (since you want to keep space only for \n, comma, and colon)
    cleaned_title = re.sub(pattern, '', cleaned_title)
    
    # Remove any leading or trailing spaces
    cleaned_title = cleaned_title.strip()
    
    return cleaned_title

df['JobDescription'] = df['JobDescription'].apply(clean_job_description_no_punctuation)
df 

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer Froedtert Sout...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,The Boeing Company search Lead Guidance Naviga...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,Job Description At Boeing innovate collabora...
3,00000336462,Senior Process Controls Engineer,Job Description At Boeing innovate collabora...
4,00000338951,RFMicrowave Engineer Level,Job Description At Boeing innovate collabora...
...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,Summary Explore new career BLM people preciou...
20133,Y7193L,STATIONARY ENGINEER HELPER,STATIONARY ENGINEER HELPER Print http agency...
20134,Y7198M,STATIONARY ENGINEER II,STATIONARY ENGINEER II Print http agency.go...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,STATIONARY ENGINEER CONTROLS SPECIALIST Prin...


In [18]:
# Use `sample` to get 5 random rows 
random_rows = df.sample(n=5)

# Increase column width to display the full content
pd.set_option('display.max_colwidth', None)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['JobDescription']}\n---")

Index: 17537
Content: Electro Optic Infrared Engineer  Lexington Park   Maryland   USA  Apply https careers.boozallen.comjobsJobDetailApply?jobId75710sourceCWS17380    you   Booz Allen https www.boozallen.comcareers.html    empowered   Learn More  Job Description  Location Lexington Park  Maryland  USA  Remote Work No  Job Number R0165836
---
Index: 14742
---
Index: 13852
Content: APTIM  come work day knowing making impact world. work spans safeguarding maintaining critical infrastructure helping communities recover natural disasters  empowering armed forces first responders reducing carbon energy use  making cities resilient threats climate change restoring contaminated ecological systems.  Job Overview   A Construction Field Engineers help direct execute work engineering disciplines field support construction projects. required  interface coordinate activities home office engineering  field construction supervision  field subcontractors. Additionally  may support Field Procurement  S

In [19]:
pd.reset_option('display.max_colwidth')

#### Let's now get rid of all urls 

### <u>Removing URLs in Pandas</u>
https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe

In [20]:
df['JobDescription'] = (df['JobDescription']
                         .replace(r'http\S+', '', regex=True)
                         .replace(r'www\S+', '', regex=True)
                         .replace(r'\b\w+\.\w+\b', '', regex=True))

In [23]:
# Use `sample` to get 5 random rows 
random_rows = df.sample(n=5)

# Increase column width to display the full content
pd.set_option('display.max_colwidth', None)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['JobDescription']}\n---")

Index: 11856
Content: Description  We are Lockheed Martin  Flight Test Engineer supporting F16 aircraft Test Conductor within F16 CTF  Edwards AFB  CA. Position work military  contractor  civil service personnel members flight test team.  Duties responsibilities include planning  scheduling  executing flight test mission requirements by    Developing publishing test plans flight test cards.   Determining communicating aircraft test requirements configuration responsible engineering  maintenance  instrumentation  flight operations personnel ensure aircraft readiness.   Complying published test safety plans standard operating procedures.   Conducting test missions performing test mission briefings debriefings  meeting qualification requirements.   Evaluating results flight activities incorporating lessons learned future test activities.  A level 3 employee Typically 5  10 years professional experience.  Whats You  Our employees play active role strengthening quality life live work volunt

In [24]:
# Replace multiple spaces with a single space
df['JobDescription'] = df['JobDescription'].str.replace(r'\s+', ' ', regex=True)

In [26]:
# Use `sample` to get 5 random rows for JobDescription
random_rows = df.sample(n=5)

# Increase column width to display the full content
pd.set_option('display.max_colwidth', None)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['JobDescription']}\n---")

Index: 7291
Content: Sr. Wastewater Engineer 230001O7 Description Many worlds top engineers scientists come together Water business view communitys interaction water bit differentlyas single holistic system rather unconnected networks divided jurisdictional boundaries. Working throughout hydrologic cycle delivering sustainable solutions make sure water available needed. experts guide work scientific rigor innovative spirit vision growth. Every day help communities improve reuse protect precious resource future generations. 2025 were hiring 2 000 people like you join team. Grow us H2OU. Your Opportunity The role Senior Wastewater Engineer work independently smaller projects team member larger projects. able evaluate select specify engineer systems products project. perform variety tasks may include calculations design sketches concepts report preparation field work. Projects assigned may complex features require application mature knowledge. lead analysis proposed projects ensure struct

In [27]:
# Use `sample` to get 5 random rows for CleanedJobTitle
random_rows = df.sample(n=5)

# Increase column width to display the full content
pd.set_option('display.max_colwidth', None)

# Print each row
for idx, row in random_rows.iterrows():
    print(f"Index: {idx}\nContent: {row['CleanedJobTitle']}\n---")

Index: 5621
Content: DevSecOps Research Engineer
---
Index: 3133
Content: Customer Engineer Premier Windows Client Government
---
Index: 8294
Content: Senior Engineer
---
Index: 16442
Content: Senior Infrastructure Engineer
---
Index: 1719
Content: Architecture System Integration Engineer Accelerator ASIC
---


In [28]:
pd.reset_option('display.max_colwidth')

#### I want to check if every ID is unique, and delete all the rows that have a repeated value, assuming that row with the same requisition id is the same job and duplicates can be removed (ORIGINAL ENGINEER CSV DID HAVE DUPLICATES, THE NEWEST ONE DOESN'T)

In [29]:
df['RequisitionID'].nunique()

20136

#### Making a new variable for the pandas dataframe I want to download 

In [31]:
# Rename Job Description feature for new CSV naming purposes
cleaned_df = df.rename(columns={'OrigJobTitle': 'CleanedJobTitle'})
cleaned_df

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer Froedtert South K...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,The Boeing Company search Lead Guidance Naviga...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,Job Description At Boeing innovate collaborate...
3,00000336462,Senior Process Controls Engineer,Job Description At Boeing innovate collaborate...
4,00000338951,RFMicrowave Engineer Level,Job Description At Boeing innovate collaborate...
...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,Summary Explore new career BLM people precious...
20133,Y7193L,STATIONARY ENGINEER HELPER,STATIONARY ENGINEER HELPER Print http .?jobID3...
20134,Y7198M,STATIONARY ENGINEER II,STATIONARY ENGINEER II Print http .?jobID29656...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,STATIONARY ENGINEER CONTROLS SPECIALIST Print ...


#### Downloading new csv and testing to see if it works 

In [32]:
cleaned_df.to_csv('New_Cleaned_Engineer.csv', index = False)

In [33]:
new_df = pd.read_csv('New_Cleaned_Engineer.csv')
new_df.head()

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer Froedtert South K...
1,224907.0,Guidance Navigation Control GNC Engineer Lead,The Boeing Company search Lead Guidance Naviga...
2,331804.0,Propulsion Engineer Associate MidLevel Experie...,Job Description At Boeing innovate collaborate...
3,336462.0,Senior Process Controls Engineer,Job Description At Boeing innovate collaborate...
4,338951.0,RFMicrowave Engineer Level,Job Description At Boeing innovate collaborate...


In [34]:
new_df

Unnamed: 0,RequisitionID,CleanedJobTitle,JobDescription
0,,Licensed Stationary Engineer,Licensed Stationary Engineer Froedtert South K...
1,00000224907,Guidance Navigation Control GNC Engineer Lead,The Boeing Company search Lead Guidance Naviga...
2,00000331804,Propulsion Engineer Associate MidLevel Experie...,Job Description At Boeing innovate collaborate...
3,00000336462,Senior Process Controls Engineer,Job Description At Boeing innovate collaborate...
4,00000338951,RFMicrowave Engineer Level,Job Description At Boeing innovate collaborate...
...,...,...,...
20132,WYMP-23-12039375-MG,Mining Engineer,Summary Explore new career BLM people precious...
20133,Y7193L,STATIONARY ENGINEER HELPER,STATIONARY ENGINEER HELPER Print http .?jobID3...
20134,Y7198M,STATIONARY ENGINEER II,STATIONARY ENGINEER II Print http .?jobID29656...
20135,Y7200A,STATIONARY ENGINEER CONTROLS SPECIALIST,STATIONARY ENGINEER CONTROLS SPECIALIST Print ...


In [35]:
df['RequisitionID']

0                        NaN
1                00000224907
2                00000331804
3                00000336462
4                00000338951
                ...         
20132    WYMP-23-12039375-MG
20133                 Y7193L
20134                 Y7198M
20135                 Y7200A
20136                 Y7203D
Name: RequisitionID, Length: 20137, dtype: object

In [36]:
df['CleanedJobTitle']

0                             Licensed Stationary Engineer
1            Guidance Navigation Control GNC Engineer Lead
2        Propulsion Engineer Associate MidLevel Experie...
3                         Senior Process Controls Engineer
4                               RFMicrowave Engineer Level
                               ...                        
20132                                      Mining Engineer
20133                           STATIONARY ENGINEER HELPER
20134                               STATIONARY ENGINEER II
20135              STATIONARY ENGINEER CONTROLS SPECIALIST
20136                            CHIEF STATIONARY ENGINEER
Name: CleanedJobTitle, Length: 20137, dtype: object

In [37]:
df['JobDescription']

0        Licensed Stationary Engineer Froedtert South K...
1        The Boeing Company search Lead Guidance Naviga...
2        Job Description At Boeing innovate collaborate...
3        Job Description At Boeing innovate collaborate...
4        Job Description At Boeing innovate collaborate...
                               ...                        
20132    Summary Explore new career BLM people precious...
20133    STATIONARY ENGINEER HELPER Print http .?jobID3...
20134    STATIONARY ENGINEER II Print http .?jobID29656...
20135    STATIONARY ENGINEER CONTROLS SPECIALIST Print ...
20136    CHIEF STATIONARY ENGINEER Print http .?jobID36...
Name: JobDescription, Length: 20137, dtype: object