In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from scipy import stats

In [4]:
# Load the raw dataset from the data/raw folder (path is relative to the notebook).
df = pd.read_csv("../data/raw/dataset_synapse.csv")

In [5]:
# Preview the first rows to sanity-check how the CSV columns were parsed.
df.head()

Unnamed: 0,JobTitle,Standard_Title,CompanyName,Location,Salary_Range_INR (lakhs),Skills_Required,Posting_Date,Source_URL,role_volume
0,Software Engineer,Software Engineer,Google,"Gurugram, Bengaluru",27.8-48,"Unix, C++, Linux, Networking, Machine learning...",1 month,https://www.naukri.com/job-listings-software-e...,102707
1,Software Engineer,Software Engineer,Hewlett Packard Enterprise (HPE),Bengaluru,9.8- 17.2,"Computer science, Development Manager, Coding,...",1 week,https://www.naukri.com/job-listings-graduate-s...,102707
2,Software Engineer,Software Engineer,Coriolis Tech,Pune,3.5- 5.5,"Github, Debugging, Software Development, Life ...",1 day,https://www.naukri.com/job-listings-hiring-for...,102707
3,Software Developer,Software Developer,Tetcos,Bengaluru,4- 5.7,"C#, Multi Treading, C Coding, Networking Proto...",1 day,https://www.naukri.com/job-listings-software-d...,98619
4,Software Developer,Software Developer,Amazon,Chennai,20.2- 34,"Unix, Computer science, Object oriented design...",1 month,https://www.naukri.com/job-listings-software-d...,98619


In [6]:
# Inspect column dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   JobTitle                  35 non-null     object
 1   Standard_Title            35 non-null     object
 2   CompanyName               35 non-null     object
 3   Location                  35 non-null     object
 4   Salary_Range_INR (lakhs)  35 non-null     object
 5   Skills_Required           35 non-null     object
 6   Posting_Date              35 non-null     object
 7   Source_URL                35 non-null     object
 8   role_volume               35 non-null     int64 
dtypes: int64(1), object(8)
memory usage: 2.6+ KB


In [7]:
def clean_salary(salary_range):
    """Convert a salary range expressed in lakhs to its midpoint times 100000.

    Parameters
    ----------
    salary_range : str
        Text holding a low and a high bound separated by a hyphen, e.g.
        ``"27.8-48"`` or ``"9.8- 17.2"`` (whitespace around the numbers is
        ignored). A single figure such as ``"5"`` is treated as a range whose
        two bounds coincide.

    Returns
    -------
    float
        ``(low + high) / 2 * 100000``, or ``nan`` when the value is missing
        (not a string) or cannot be parsed as one or two numbers.
    """
    rupees_per_lakh = 100000
    missing = float('nan')

    # Missing cells arrive as NaN/None rather than text; keep them as NaN
    # instead of crashing the whole ``apply``.
    if not isinstance(salary_range, str):
        return missing

    # Accept an en dash as a separator as well as the plain hyphen.
    parts = salary_range.replace('\u2013', '-').split('-')
    try:
        bounds = [float(part.strip()) for part in parts]
    except ValueError:
        # Non-numeric text (or an empty piece) cannot be averaged.
        return missing

    if len(bounds) == 1:
        low = high = bounds[0]
    elif len(bounds) == 2:
        low, high = bounds
    else:
        return missing
    return (low + high) / 2 * rupees_per_lakh

# Add a numeric salary column: each range's midpoint as computed by clean_salary.
df['avg_salary_inr'] = df['Salary_Range_INR (lakhs)'].apply(clean_salary)

In [8]:
def parse_skills(skills_text):
    """Split a comma-separated skills string into a list of skill names.

    Parameters
    ----------
    skills_text : str
        Comma-separated skills; embedded newlines are treated as spaces.

    Returns
    -------
    list of str
        Each skill with surrounding whitespace removed. Empty entries
        (from an empty string, doubled commas or a trailing comma) are
        dropped. Non-string input yields an empty list.
    """
    if not isinstance(skills_text, str):
        return []

    flattened = skills_text.replace('\n', ' ')
    candidates = (piece.strip() for piece in flattened.split(','))
    # Filter out blanks so "" gives [] rather than [''].
    return [skill for skill in candidates if skill]

# Store the parsed skills as a Python list per posting.
df['skills_list'] = df['Skills_Required'].apply(parse_skills)

In [9]:
def standardize_date(date_str, reference_date=None):
    """Turn a relative posting age such as ``"1 week"`` into a timestamp.

    Parameters
    ----------
    date_str : str
        Relative age text. Recognised forms (case-insensitive) are a leading
        integer followed by a unit containing ``day``, ``week`` or ``month``
        (e.g. ``"3 days ago"``, ``"30+ days ago"``, ``"1 month"``), plus the
        count-less phrases ``"today"``, ``"just now"``, ``"yesterday"`` and
        anything mentioning ``hour``.
    reference_date : datetime, optional
        The moment the age is measured from. Defaults to ``datetime.now()``
        at call time. Pass a fixed value to make results reproducible and
        identical across rows.

    Returns
    -------
    datetime or pandas.NaT
        ``reference_date`` minus the parsed age (a month is approximated as
        30 days), or ``pd.NaT`` for non-string or unrecognised input.
    """
    if not isinstance(date_str, str):
        return pd.NaT  # Not a Time for missing values

    today = datetime.now() if reference_date is None else reference_date
    text = date_str.lower().strip()

    # Count-less phrases must be handled before the unit checks: "today" and
    # "yesterday" both contain "day" but have no leading integer.
    if text.startswith(('today', 'just now')) or 'hour' in text:
        return today
    if text.startswith('yesterday'):
        return today - timedelta(days=1)

    try:
        # rstrip('+') lets open-ended ages such as "30+ days ago" parse.
        count = int(text.split()[0].rstrip('+'))
    except (ValueError, IndexError):
        return pd.NaT

    if 'day' in text:
        return today - timedelta(days=count)
    if 'week' in text:
        return today - timedelta(weeks=count)
    if 'month' in text:
        # Approximation: 30 days per month
        return today - timedelta(days=count * 30)
    return pd.NaT  # Return NaT if format is not recognized

# Convert each relative posting age into a timestamp (NaT where unrecognised).
df['standard_date'] = df['Posting_Date'].apply(standardize_date)

In [10]:
# Build the cleaned frame without the three raw text columns that the
# derived columns were computed from.
df_cleaned = df.drop(
    columns=[
        'Salary_Range_INR (lakhs)',
        'Skills_Required',
        'Posting_Date',
    ]
)

In [11]:
# Preview the cleaned frame to confirm the derived columns look right.
df_cleaned.head()

Unnamed: 0,JobTitle,Standard_Title,CompanyName,Location,Source_URL,role_volume,avg_salary_inr,skills_list,standard_date
0,Software Engineer,Software Engineer,Google,"Gurugram, Bengaluru",https://www.naukri.com/job-listings-software-e...,102707,3790000.0,"[Unix, C++, Linux, Networking, Machine learnin...",2025-08-07 21:52:33.490587
1,Software Engineer,Software Engineer,Hewlett Packard Enterprise (HPE),Bengaluru,https://www.naukri.com/job-listings-graduate-s...,102707,1350000.0,"[Computer science, Development Manager, Coding...",2025-08-30 21:52:33.490587
2,Software Engineer,Software Engineer,Coriolis Tech,Pune,https://www.naukri.com/job-listings-hiring-for...,102707,450000.0,"[Github, Debugging, Software Development, Life...",2025-09-05 21:52:33.490587
3,Software Developer,Software Developer,Tetcos,Bengaluru,https://www.naukri.com/job-listings-software-d...,98619,485000.0,"[C#, Multi Treading, C Coding, Networking Prot...",2025-09-05 21:52:33.490587
4,Software Developer,Software Developer,Amazon,Chennai,https://www.naukri.com/job-listings-software-d...,98619,2710000.0,"[Unix, Computer science, Object oriented desig...",2025-08-07 21:52:33.490587


In [12]:
def get_trend_slope(csv_file_path):
    """Return the linear-regression slope of the interest series in a trends CSV.

    The file is read with its first two lines skipped and is expected to
    contain exactly two columns, which are renamed to ``Month`` and
    ``Interest`` (presumably a search-trends export; confirm against the
    files in ``data/raw/trends``). Interest values that are not numeric are
    counted as 0. The slope is fitted against the row position
    (0, 1, 2, ...), so it is expressed per row of the file.

    Parameters
    ----------
    csv_file_path : str
        Path to the trends CSV.

    Returns
    -------
    float
        The fitted slope. This function is best-effort: it prints a message
        and returns ``0.0`` when no path is given, the file is missing, the
        series has fewer than two points, or any other error occurs.
    """
    if not csv_file_path:
        print("Warning: No trends file path given. Returning slope of 0.")
        return 0.0

    try:
        trends_df = pd.read_csv(csv_file_path, skiprows=2)
        trends_df.columns = ['Month', 'Interest']
        interest = pd.to_numeric(trends_df['Interest'], errors='coerce').fillna(0)

        # A line cannot be fitted through fewer than two points.
        if len(interest) < 2:
            print(f"Warning: Not enough data points in {csv_file_path}. Returning slope of 0.")
            return 0.0

        # Only the slope is needed, so read it from the result object
        # instead of unpacking all five regression outputs.
        time_index = list(range(len(interest)))
        return stats.linregress(time_index, interest).slope
    except FileNotFoundError:
        print(f"Warning: File not found at {csv_file_path}. Returning slope of 0.")
        return 0.0
    except Exception as e:
        # Deliberately broad: one bad trends file should not abort the run.
        print(f"An error occurred with file {csv_file_path}: {e}")
        return 0.0


In [13]:
# Lookup table: job-title label -> path of that title's trends CSV
# (paths are relative to the notebook's directory).
trends_file_mapping = {
    "Software Engineer": "../data/raw/trends/trends_software_engineer.csv",
    "Software Developer": "../data/raw/trends/trends_software_developer.csv",
    "Frontend Developer": "../data/raw/trends/trends_frontend_developer.csv",
    "FullStack Developer": "../data/raw/trends/trends_fullstack_developer.csv",
    "Backend Developer": "../data/raw/trends/trends_backend_developer.csv",
    "Data Analyst": "../data/raw/trends/trends_data_analyst.csv",
    "AI/ML Engineer": "../data/raw/trends/trends_aiml_engineer.csv",
    "DevOps": "../data/raw/trends/trends_devops.csv",
    "CyberSecurity Analyst": "../data/raw/trends/trends_cybersecurity_analyst.csv",
    "Business Analyst": "../data/raw/trends/trends_business_analyst.csv",
    "App Developer": "../data/raw/trends/trends_app_developer.csv",
    "Software Tester": "../data/raw/trends/trends_software_tester.csv",
    "Technical Support": "../data/raw/trends/trends_technical_support.csv",
    "Network Engineer": "../data/raw/trends/trends_network_engineer.csv",
    "Product Engineer": "../data/raw/trends/trends_product_engineer.csv",
}

In [14]:
# Every row with the same Standard_Title uses the same trends file, so fit the
# regression once per distinct title instead of re-reading the CSV for each row.
slope_by_title = {
    title: get_trend_slope(trends_file_mapping[title])
    for title in df_cleaned['Standard_Title'].dropna().unique()
    if trends_file_mapping.get(title)
}

# Titles with no mapped trends file (and missing titles) get a score of 0.
df_cleaned['fgm_score'] = df_cleaned['Standard_Title'].map(slope_by_title).fillna(0)


In [15]:
# One score per title: the score is derived from the title alone, so taking the
# first row of each group is representative of the whole group.
df_cleaned.groupby('Standard_Title')['fgm_score'].first()

Standard_Title
AI/ML Engineer           0.236254
App Developer            0.055563
Backend Developer        0.233981
Business Analyst         0.016538
CyberSecurity Analyst    0.289981
Data Analyst             0.165257
DevOps                   0.118873
Frontend Developer       0.344176
FullStack Developer      0.289331
Network Engineer         0.115382
Product Engineer         0.127651
Software Developer       0.007628
Software Engineer        0.465592
Software Tester         -0.020765
Technical Support        0.034056
Name: fgm_score, dtype: float64

In [16]:
# Show a bounded sample instead of dumping the entire frame into the cell output.
df_cleaned.head(10)

Unnamed: 0,JobTitle,Standard_Title,CompanyName,Location,Source_URL,role_volume,avg_salary_inr,skills_list,standard_date,fgm_score
0,Software Engineer,Software Engineer,Google,"Gurugram, Bengaluru",https://www.naukri.com/job-listings-software-e...,102707,3790000.0,"[Unix, C++, Linux, Networking, Machine learnin...",2025-08-07 21:52:33.490587,0.465592
1,Software Engineer,Software Engineer,Hewlett Packard Enterprise (HPE),Bengaluru,https://www.naukri.com/job-listings-graduate-s...,102707,1350000.0,"[Computer science, Development Manager, Coding...",2025-08-30 21:52:33.490587,0.465592
2,Software Engineer,Software Engineer,Coriolis Tech,Pune,https://www.naukri.com/job-listings-hiring-for...,102707,450000.0,"[Github, Debugging, Software Development, Life...",2025-09-05 21:52:33.490587,0.465592
3,Software Developer,Software Developer,Tetcos,Bengaluru,https://www.naukri.com/job-listings-software-d...,98619,485000.0,"[C#, Multi Treading, C Coding, Networking Prot...",2025-09-05 21:52:33.490587,0.007628
4,Software Developer,Software Developer,Amazon,Chennai,https://www.naukri.com/job-listings-software-d...,98619,2710000.0,"[Unix, Computer science, Object oriented desig...",2025-08-07 21:52:33.490587,0.007628
5,Software Developer,Software Developer,Nokia,Bengaluru,https://www.naukri.com/job-listings-software-d...,98619,845000.0,"[c++, rtos, linux, debugging, software develop...",2025-08-23 21:52:33.490587,0.007628
6,JavaScript front end developer,Frontend Developer,Avom Consultants,Bengaluru,https://www.naukri.com/job-listings-javascript...,38966,1700000.0,"[css, react js, Ember.Js, Rxjs, Backbone.Js, V...",2025-08-30 21:52:33.490587,0.344176
7,Frontend Developer,Frontend Developer,All In One Consultancy,Nagpur,https://www.naukri.com/job-listings-front-end-...,38966,362500.0,"[Frontend Development, Typescript, Javascript,...",2025-08-30 21:52:33.490587,0.344176
8,Frontend Developer,Frontend Developer,Nasiwak Services India Pvt Ltd,Bengaluru,https://www.naukri.com/job-listings-front-end-...,38966,450000.0,"[Python, CSS, Javascript, HTML, React.Js]",2025-08-30 21:52:33.490587,0.344176
9,Fullstack Web Developer (MERN/Django),FullStack Developer,Krishworks Technology And Research Labs,Bengaluru,https://www.naukri.com/job-listings-fullstack-...,2270,600000.0,"[Mern Stack, coding, Mean Stack, Django, probl...",2025-08-16 21:52:33.490587,0.289331


In [17]:
# Persist the cleaned frame as CSV, creating the target folder on first run.
output_path = '../data/processed/market_intelligence_db.csv'
os.makedirs(
    os.path.dirname(output_path),
    exist_ok=True,
)
df_cleaned.to_csv(
    output_path,
    index=False,
)
