In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast  
import seaborn as sns

In [None]:
# Short column names, assigned positionally: their order must match the CSV's column order.
new_columns = [
    'job', 'full_name', 'location', 'via', 'schedule', 'work_from_home',
    'search_location', 'posted_dt', 'no_degree', 'health_ins', 'country',
    'salrate', 'salyr', 'salhr', 'company', 'skills', 'skilltype',
]

df = pd.read_csv(r'D:\Python\data_jobs.csv')
df.columns = new_columns

# --- data clean-up ---
# Convert the posting date column from strings to datetimes.
df['posted_dt'] = pd.to_datetime(df['posted_dt'])

# Evaluate the string stored in `skills` into a Python object (a list of skills);
# missing values are passed through untouched.
df['skills'] = df['skills'].apply(
    lambda raw: raw if pd.isna(raw) else ast.literal_eval(raw)
)

# Discard rows whose `salyr` value is missing.
df = df.dropna(subset=['salyr'])

##### India: Data Analyst postings

In [None]:
# Postings whose job is 'Data Analyst' and whose country is 'India' (independent copy).
df_ind = df.loc[df['job'].eq('Data Analyst') & df['country'].eq('India')].copy()

# One row per element of each posting's `skills` list.
df_ind_exp = df_ind.explode('skills')
df_ind_exp

In [None]:
# Per skill: median of `salyr` and number of non-null `salyr` rows, most frequent skill first.
df_ind_table = (
    df_ind_exp
    .groupby('skills')['salyr']
    .agg(['median', 'count'])
    .sort_values('count', ascending=False)
)

In [None]:
# Each skill's count as a percentage of the number of rows in df_ind.
# (Divide first, then scale, so the floating-point results stay exactly as before.)
df_ind_table['skill_percent'] = df_ind_table['count'].div(len(df_ind)).mul(100)

In [None]:
# Display the per-skill summary table (median, count and skill_percent columns).
df_ind_table

Convert the `skilltype` column into a single dictionary that maps each technology category to all of its related skills.

In [None]:
# Unique, non-null `skilltype` strings of the India Data Analyst postings.
df_tech = df_ind['skilltype'].drop_duplicates().dropna()

# Merge every per-posting mapping into one {technology: skills} dictionary.
# Skills are collected in sets so repeated skills collapse as they are added.
tech_dict = {}
for row in df_tech:
    row_dict = ast.literal_eval(row)  # each entry is a string holding a dict literal
    for key, value in row_dict.items():
        tech_dict.setdefault(key, set()).update(value)

# Convert the sets back to lists. They are sorted so the result is the same on every
# run: the order of list(set(...)) over strings changes between kernel restarts
# because of hash randomisation.
# NOTE(review): assumes the skill names are mutually comparable (e.g. all strings).
tech_dict = {key: sorted(value) for key, value in tech_dict.items()}

tech_dict

In [None]:
# Long-format lookup built from tech_dict: one row per (technology, skill) pair.
df_tech = (
    pd.DataFrame(list(tech_dict.items()), columns=['technology', 'skills'])
    .explode('skills')
)
df_tech

In [None]:
# Join the per-skill statistics (df_ind_table) with the technology lookup (df_tech)
# on the skill name; the default inner join keeps only skills present in both.
df_ind_table_tech = pd.merge(df_ind_table, df_tech, on='skills')
df_ind_table_tech

In [None]:
# Keep the skills whose skill_percent is above 5, then index the result by skill name
# so the index can be used for the point labels when plotting.
df_ind_plot = (
    df_ind_table_tech
    .loc[df_ind_table_tech['skill_percent'].gt(5)]
    .set_index('skills')
)
df_ind_plot

In [None]:

from adjustText import adjust_text
from matplotlib.ticker import FuncFormatter, PercentFormatter

# Apply the seaborn theme BEFORE the figure is created: the theme works through
# matplotlib's rcParams, so setting it after drawing left this figure unstyled on a
# fresh kernel and changed its look on a re-run.
sns.set_theme(style='ticks')

# Explicit figure/axes pair; everything below draws on `ax`.
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot: share of postings (x) against median salary (y), coloured by technology.
sns.scatterplot(
    data=df_ind_plot,
    x='skill_percent',
    y='median',
    hue='technology',
    ax=ax,
)
sns.despine(ax=ax)

# One text label per point, using the skill name stored in the index.
texts = [
    ax.text(x_val, y_val, skill)
    for skill, x_val, y_val in zip(
        df_ind_plot.index, df_ind_plot['skill_percent'], df_ind_plot['median']
    )
]

# Move the labels apart so they do not overlap; arrows point back to the data points.
adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='->', color='gray'))

# Axis labels, title and legend.
ax.set_xlabel('How Often Skills Appear in Job Postings')
ax.set_ylabel('Median Yearly Salary')
ax.set_title('Most Optimal Skills for Data Analysts in India')
ax.legend(title='Technology')

# Tick formatting: y as thousands with a dollar sign (e.g. $100K), x as whole percentages.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, pos: f'${int(y/1000)}K'))
ax.xaxis.set_major_formatter(PercentFormatter(decimals=0))

# Tighten the layout and render the figure.
fig.tight_layout()
plt.show()


Plotting for the top 4 data jobs

In [None]:
# The four most frequent values of the `job` column (value_counts sorts by frequency, descending).
top4_jobs = list(df['job'].value_counts().index[:4])
job_country = ['India']

# Print both lists, one per line.
print(top4_jobs, job_country, sep='\n')

In [None]:

def _skill_salary_table(df_jobs, job_title, countries):
    """Build the per-skill salary/frequency table for one job title.

    All intermediate objects stay local to this function, so running this cell no
    longer overwrites notebook-level variables such as `df_tech` and `tech_dict`.

    Parameters
    ----------
    df_jobs : pandas.DataFrame
        Postings with the columns 'job', 'country', 'skills', 'salyr' and 'skilltype'.
    job_title : str
        Value of the 'job' column to keep.
    countries : list of str
        Values of the 'country' column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per (skill, technology) pair, containing the columns 'skills',
        'median', 'count', 'skill_percent' and 'technology'.
    """
    df_job = df_jobs[(df_jobs['job'] == job_title) & (df_jobs['country'].isin(countries))].copy()

    # Median `salyr` and row count per skill, most frequent skill first.
    skill_stats = (
        df_job.explode('skills')
        .groupby('skills')['salyr']
        .agg(['median', 'count'])
        .sort_values(by='count', ascending=False)
    )
    # Count as a percentage of the postings selected for this job title.
    skill_stats['skill_percent'] = skill_stats['count'] / len(df_job) * 100

    # Merge the per-posting `skilltype` dict literals into {technology: set of skills}.
    tech_to_skills = {}
    for raw in df_job['skilltype'].drop_duplicates().dropna():
        for tech, skills in ast.literal_eval(raw).items():
            tech_to_skills.setdefault(tech, set()).update(skills)

    # Long-format lookup, one row per (technology, skill). Skills are sorted so the
    # row order is reproducible (set iteration order varies between kernel restarts).
    # NOTE(review): assumes the skill names are mutually comparable (e.g. all strings).
    tech_lookup = pd.DataFrame(
        [(tech, sorted(skills)) for tech, skills in tech_to_skills.items()],
        columns=['technology', 'skills'],
    ).explode('skills')

    return skill_stats.merge(tech_lookup, on='skills')


# Plot-ready table for each of the top job titles, keyed by job title.
dict_df = {}
for job_title in top4_jobs:
    job_table = _skill_salary_table(df, job_title, job_country)

    # 'Data Engineer' is filtered by absolute count (> 40); every other job by
    # share of postings (> 15 percent).
    if job_title == 'Data Engineer':
        keep = job_table['count'] > 40
    else:
        keep = job_table['skill_percent'] > 15

    # Index by skill name so the index can be used for point labels when plotting.
    dict_df[job_title] = job_table[keep].set_index('skills')


In [None]:
# Inspect the table stored in dict_df under the 'Data Engineer' key.
dict_df['Data Engineer']

In [None]:

from adjustText import adjust_text
from matplotlib.ticker import PercentFormatter

# 2x2 grid of panels, filled row by row with one job title each.
fig, ax = plt.subplots(2, 2, figsize=(12, 8))

# zip stops at the shorter input, so entries of dict_df beyond the fourth are ignored.
for panel, (job_title, job_table) in zip(ax.flat, dict_df.items()):
    sns.scatterplot(data=job_table, x='skill_percent', y='median', hue='technology', ax=panel)

    # Label every point with its skill name (the table's index) ...
    texts = [
        panel.text(x_val, y_val, skill, fontsize=9)
        for skill, x_val, y_val in zip(
            job_table.index, job_table['skill_percent'], job_table['median']
        )
    ]
    # ... then move the labels apart, with arrows back to their points.
    adjust_text(texts, ax=panel, arrowprops=dict(arrowstyle='->', color='gray'))

    # x ticks as whole percentages, y ticks as thousands with a dollar sign.
    panel.xaxis.set_major_formatter(PercentFormatter(decimals=0))
    panel.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'${int(y/1000)}K'))
    panel.set_title(job_title)
    panel.legend(fontsize='small')

# x labels only on the bottom row, y labels only on the left column.
for row in range(2):
    for col in range(2):
        ax[row, col].set_xlabel('How Often Skills Appear in Job Postings' if row == 1 else '')
        ax[row, col].set_ylabel('Median Salary' if col == 0 else '')

# Figure-level title, layout adjustment and rendering.
plt.suptitle('Optimal Skills to Learn for Top 4 Data Jobs in India', y=1)
plt.tight_layout()
plt.show()


In [None]:
# NOTE(review): this cell draws the same figure as the preceding plotting cell.
from adjustText import adjust_text              # moves text labels apart so they do not overlap
from matplotlib.ticker import PercentFormatter  # renders tick values as percentages (used on the x-axis below)

# Grid dimensions of the figure: 2 rows x 2 columns, 12x8 inches overall.
n_rows, n_cols = 2, 2
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 8))

# Only the first n_rows * n_cols entries of dict_df get a panel.
for slot, (job_title, job_table) in enumerate(list(dict_df.items())[:n_rows * n_cols]):

    # Row-major placement: slot 0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 0), 3 -> (1, 1).
    panel = ax[divmod(slot, n_cols)]

    # Scatter of share of postings (x) against median salary (y), coloured by technology.
    sns.scatterplot(data=job_table, x='skill_percent', y='median', hue='technology', ax=panel)

    # One label per data point; the label text is the skill name held in the index.
    texts = []
    for skill, pct, med in job_table[['skill_percent', 'median']].itertuples(name=None):
        texts.append(panel.text(pct, med, skill, fontsize=9))

    # Shift overlapping labels; arrows connect moved labels to their points.
    adjust_text(texts, ax=panel, arrowprops=dict(arrowstyle='->', color='gray'))

    # x-axis: percentages without decimals.
    panel.xaxis.set_major_formatter(PercentFormatter(decimals=0))
    # y-axis: thousands with a dollar sign (e.g. $100K).
    panel.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'${int(y/1000)}K'))
    # Panel title is the dictionary key; legend uses a small font.
    panel.set_title(job_title)
    panel.legend(fontsize='small')

# Axis labels: x label only on the bottom row, y label only on the left column.
ax[0, 0].set(xlabel='', ylabel='Median Salary')
ax[0, 1].set(xlabel='', ylabel='')
ax[1, 0].set(xlabel='How Often Skills Appear in Job Postings', ylabel='Median Salary')
ax[1, 1].set(xlabel='How Often Skills Appear in Job Postings', ylabel='')

# Title for the whole figure, placed at the top edge (y=1).
plt.suptitle('Optimal Skills to Learn for Top 4 Data Jobs in India', y=1)

# Fit the panels and labels within the figure, then render it.
plt.tight_layout()
plt.show()
