In [1]:
import pandas as pd

# Load the dataset into `df`. The path is relative, so it is resolved against
# the working directory the notebook is run from.
DATASET_PATH = 'dfSet.csv'
df = pd.read_csv(DATASET_PATH)

## Exploratory Data Analysis

## Feature Engineering

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# --- TF-IDF features for the free-text columns ------------------------------
# Each text column present in `df` is vectorized independently; columns that
# are absent from `df` are skipped.
text_columns = ['description', 'requirements', 'company_profile', 'benefits']
# Upper bound on the vocabulary kept per text column (TfidfVectorizer max_features).
max_tfidf_features = 100
tfidf_vectorizers = {}  # column name -> fitted TfidfVectorizer
tfidf_features = []     # one dense DataFrame of TF-IDF weights per processed column

for column in text_columns:
    if column not in df.columns:
        continue

    tfidf = TfidfVectorizer(max_features=max_tfidf_features)
    # Missing text is treated as an empty document.
    tfidf_matrix = tfidf.fit_transform(df[column].fillna(''))

    # max_features is only a cap: a column with a small vocabulary yields fewer
    # output columns, so the labels are sized from the matrix actually produced
    # rather than from the cap (a fixed range(100) raises a shape mismatch).
    n_terms = tfidf_matrix.shape[1]
    tfidf_features.append(
        pd.DataFrame(
            tfidf_matrix.toarray(),
            index=df.index,  # keep rows aligned with `df` for the later axis=1 concat
            columns=[f"{column}_tfidf_{i}" for i in range(n_terms)],
        )
    )
    tfidf_vectorizers[column] = tfidf

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (same rows, no columns) when none of the text columns exist in `df`.
if tfidf_features:
    tfidf_features_df = pd.concat(tfidf_features, axis=1)
else:
    tfidf_features_df = pd.DataFrame(index=df.index)

# --- Integer codes for the categorical columns -------------------------------
# Each categorical column present in `df` is label-encoded into a single
# "<column>_encoded" integer column; columns absent from `df` are skipped.
# NOTE(review): LabelEncoder assigns codes by sorted class order, so the integer
# values carry no meaningful ordering — confirm the downstream model is fine
# with that (OneHotEncoder is imported above but not used here).
categorical_columns = ['location', 'department', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
label_encoders = {}    # column name -> fitted LabelEncoder, so codes can be inverted/reapplied
encoded_features = []  # one single-column DataFrame per processed column

for column in categorical_columns:
    if column not in df.columns:
        continue

    le = LabelEncoder()
    # Missing values become their own 'Unknown' category.
    encoded_column = le.fit_transform(df[column].fillna('Unknown'))
    encoded_features.append(
        pd.DataFrame(
            encoded_column,
            index=df.index,  # keep rows aligned with `df` for the later axis=1 concat
            columns=[f"{column}_encoded"],
        )
    )
    label_encoders[column] = le

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (same rows, no columns) when none of the categorical columns exist in `df`.
if encoded_features:
    encoded_features_df = pd.concat(encoded_features, axis=1)
else:
    encoded_features_df = pd.DataFrame(index=df.index)

# --- Binary flags -------------------------------------------------------------
# Flag columns are stored as 't' / 'f' strings and are converted to 1 / 0 in
# place on `df`. Any value other than 't' (including NaN) becomes 0.
binary_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'in_balanced_dataset']
# Only the flag columns that actually exist in `df`; used for both the
# conversion and the selection below so a missing column cannot raise KeyError.
present_binary_columns = [column for column in binary_columns if column in df.columns]

for column in present_binary_columns:
    # Accept an already-converted 1 as well as 't': the conversion overwrites
    # `df`, so without this a second run of the cell would turn every flag into 0.
    df[column] = df[column].isin(['t', 1]).astype('int64')

# Combine the TF-IDF, label-encoded and binary features column-wise.
# NOTE(review): 'fraudulent' looks like the prediction target — confirm it is
# separated from the features before any model is trained on `final_features`.
final_features = pd.concat([tfidf_features_df, encoded_features_df, df[present_binary_columns]], axis=1)

# --- Assemble the prepared dataset --------------------------------------------
# Drop the raw text columns (already represented by the TF-IDF features) plus
# 'title' and 'salary_range'; errors='ignore' tolerates columns that are absent.
unused_columns = ['title', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits']
data = df.drop(columns=unused_columns, errors='ignore')

# `data` still carries columns that are also part of `final_features` (the
# binary flags). Concatenating both as-is would produce duplicate column labels,
# and selecting such a label returns a DataFrame instead of a Series. Keep the
# copy from `final_features` and drop the overlapping one from the `data` side.
overlapping_columns = data.columns.intersection(final_features.columns)
prepared_dataset = pd.concat([data.drop(columns=overlapping_columns), final_features], axis=1)
