In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [30]:
# Load the dataset
df = pd.read_csv("C:/3rd year materials/machine laerning/jobs_in_data.csv")
print(df.head())



   work_year             job_title                    job_category  \
0       2023  Data DevOps Engineer                Data Engineering   
1       2023        Data Architect  Data Architecture and Modeling   
2       2023        Data Architect  Data Architecture and Modeling   
3       2023        Data Scientist       Data Science and Research   
4       2023        Data Scientist       Data Science and Research   

  salary_currency   salary  salary_in_usd employee_residence experience_level  \
0             EUR      NaN        95012.0            Germany        Mid-level   
1             USD      NaN       186000.0      United States           Senior   
2             USD  81800.0            NaN      United States           Senior   
3             USD      NaN       212000.0      United States           Senior   
4             USD      NaN        93300.0      United States           Senior   

  employment_type work_setting company_location company_size  
0       Full-time       Hybri

In [6]:
data_info = {
    "shape": df.shape,
    "column_names": df.columns.tolist(),
    "data_types": df.dtypes,
    "missing_values": df.isnull().sum(),
    "sample_data": df.head()
}

data_info

{'shape': (9355, 12),
 'column_names': ['work_year',
  'job_title',
  'job_category',
  'salary_currency',
  'salary',
  'salary_in_usd',
  'employee_residence',
  'experience_level',
  'employment_type',
  'work_setting',
  'company_location',
  'company_size'],
 'data_types': work_year               int64
 job_title              object
 job_category           object
 salary_currency        object
 salary                float64
 salary_in_usd         float64
 employee_residence     object
 experience_level       object
 employment_type        object
 work_setting           object
 company_location       object
 company_size           object
 dtype: object,
 'missing_values': work_year                0
 job_title               26
 job_category             6
 salary_currency         12
 salary                3839
 salary_in_usd          820
 employee_residence       6
 experience_level        10
 employment_type         11
 work_setting            10
 company_location         8
 company

In [31]:
print(df.isnull().sum())


work_year                0
job_title               26
job_category             6
salary_currency         12
salary                3839
salary_in_usd          820
employee_residence       6
experience_level        10
employment_type         11
work_setting            10
company_location         8
company_size            10
dtype: int64


In [32]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Step 1: Identify numeric and categorical columns
numeric_cols = df.select_dtypes(include=['number']).columns  # Handles all numeric types
categorical_cols = df.select_dtypes(include=['object', 'bool', 'category']).columns

# Step 2: Handle columns with all missing values separately
# For numeric columns with all NaNs, we can fill them with a constant value (e.g., 0 or a placeholder like -1)
df[numeric_cols] = df[numeric_cols].fillna(0)  # Or use -1, or other placeholders if needed

# For categorical columns with all NaNs, we can fill them with a constant value like "Unknown" or any other placeholder
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Step 3: Apply imputers to columns that still have missing values
# Re-create imputers
num_imputer = SimpleImputer(strategy='mean')  # Use 'median' if preferred
cat_imputer = SimpleImputer(strategy='most_frequent')

# Apply imputers only for columns with non-missing values
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Step 4: Check for remaining missing values (optional)
print("Remaining missing values:")
print(df.isnull().sum())

# Optional: Monitor imputed columns
print("Imputed numeric columns:", numeric_cols)
print("Imputed categorical columns:", categorical_cols)

# Optional: Assert no missing values remain
assert df.isnull().sum().sum() == 0, "Still missing values present!"


Remaining missing values:
work_year             0
job_title             0
job_category          0
salary_currency       0
salary                0
salary_in_usd         0
employee_residence    0
experience_level      0
employment_type       0
work_setting          0
company_location      0
company_size          0
dtype: int64
Imputed numeric columns: Index(['work_year', 'salary', 'salary_in_usd'], dtype='object')
Imputed categorical columns: Index(['job_title', 'job_category', 'salary_currency', 'employee_residence',
       'experience_level', 'employment_type', 'work_setting',
       'company_location', 'company_size'],
      dtype='object')


In [33]:
df_new = df.drop(['salary', 'employee_residence', 'job_category','salary_currency'], axis=1)


In [34]:
print(df_new.columns)


Index(['work_year', 'job_title', 'salary_in_usd', 'experience_level',
       'employment_type', 'work_setting', 'company_location', 'company_size'],
      dtype='object')


In [35]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# List of selected features
selected_features = ['work_year', 'job_title', 'salary_in_usd', 
                     'experience_level', 'employment_type', 'work_setting', 
                     'company_location', 'company_size']

# Subset the DataFrame to include only the selected features
df_subset = df[selected_features]

# 1. Label encode 'experience_level' (ordinal: 'Junior', 'Mid-level', 'Senior' -> 0, 1, 2)
label_encoder = LabelEncoder()
df_subset['experience_level'] = label_encoder.fit_transform(df_subset['experience_level'])

# 2. Label encode 'company_size' (ordinal: 'Small', 'Medium', 'Large' -> 0, 1, 2)
df_subset['company_size'] = label_encoder.fit_transform(df_subset['company_size'])

# 3. Perform One-Hot Encoding on the remaining categorical features (like job_title, salary_currency, etc.)
df_encoded = pd.get_dummies(df_subset, drop_first=True)

# Split into training and testing data
X = df_encoded.drop('salary_in_usd', axis=1)  # Assuming 'salary_in_usd' is your target variable
y = df_encoded['salary_in_usd']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure that X_test has the same columns as X_train (with the same one-hot encoding)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the test set
r2 = model.score(X_test, y_test)
print(f"R² Score: {r2:.3f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['experience_level'] = label_encoder.fit_transform(df_subset['experience_level'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['company_size'] = label_encoder.fit_transform(df_subset['company_size'])


R² Score: 0.165


In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model with the scaled data
model.fit(X_train_scaled, y_train)
r2 = model.score(X_test_scaled, y_test)
print(f"R² Score: {r2:.3f}")


R² Score: 0.165


In [37]:
# Create a DataFrame to compare actual vs predicted salaries
comparison_df = pd.DataFrame({
    "Actual Salary": y_test.values,
    "Predicted Salary": y_pred
})

# Show first 10 results
print(comparison_df.head(10))


NameError: name 'y_pred' is not defined

In [38]:
# Filter out rows with zero salary
df = df[df["salary_in_usd"] > 0]


In [39]:
# Target and feature split
y = df["salary_in_usd"]
X = df.drop(columns=["salary_in_usd"])

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [40]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [41]:
# Show updated Actual vs Predicted
pd.DataFrame({
    "Actual Salary": y_test.values,
    "Predicted Salary": y_pred
}).head(10)


Unnamed: 0,Actual Salary,Predicted Salary
0,48585.0,65132.145057
1,104650.0,175384.148751
2,130900.0,143784.442058
3,205000.0,179605.857958
4,156400.0,138910.434683
5,192564.0,185979.613572
6,79600.0,149271.211554
7,204500.0,176883.498391
8,180180.0,148482.687458
9,210000.0,194701.330637


In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 35270.67590949201
RMSE: 46236.91937174553
R² Score: 0.45998577247543415


In [43]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


In [44]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("R² Score:", r2_score(y_test, y_pred_rf))


MAE: 16632.350543197463
RMSE: 31583.22366842242
R² Score: 0.7480349333611551


In [45]:
results = pd.DataFrame({
    "Actual Salary": y_test.values,
    "Predicted Salary": y_pred
})

print(results.head(10))  # Show first 10 rows


   Actual Salary  Predicted Salary
0        48585.0      65132.145057
1       104650.0     175384.148751
2       130900.0     143784.442058
3       205000.0     179605.857958
4       156400.0     138910.434683
5       192564.0     185979.613572
6        79600.0     149271.211554
7       204500.0     176883.498391
8       180180.0     148482.687458
9       210000.0     194701.330637
