In [17]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import requests
import pandas as pd
import numpy as np
from scipy import stats
import pandas as pd
from isort import file
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import ipywidgets as widgets
from IPython.display import display, HTML
import ast
import json
import numpy as np
import pandas as pd
from scipy import stats
import numpy as np
from scipy import stats
import json
import ipywidgets as widgets
from IPython.display import display, HTML

In [18]:


def load_data_from_url(url, filename):
    """
    Downloads data from the given URL and saves it to the specified file.

    Args:
        url (str): The URL of the data to be downloaded.
        filename (str): The name of the file to save the downloaded data.

    Returns:
        None
    """
    if not os.path.exists(filename):
        response = requests.get(url)
        with open(filename, 'wb') as f:
            f.write(response.content)

def extract_data(filename):
    """
    Extracts relevant columns from a CSV file containing job data.

    Parameters:
    filename (str): The path to the CSV file.

    Returns:
    pandas.DataFrame: A DataFrame containing the filtered job data.
    """
    jobs_data = pd.read_csv(filename)
    columns_of_interest = [
        "job_title_short", "job_location", "job_via", "job_schedule_type",
        "job_work_from_home", "job_posted_date", "job_skills", "job_country", "search_location", "company_name",
        "job_title", 'salary_year_avg', 'job_no_degree_mention', 'job_health_insurance','job_type_skills'
    ]
    jobs_data_filtered = jobs_data[columns_of_interest]
    return jobs_data_filtered

def preprocess_data(jobs_data):
    """
    Preprocesses the job data by filling missing values, converting data types, creating new columns,
    and encoding categorical variables.

    Args:
        jobs_data (pandas.DataFrame): The input job data.

    Returns:
        pandas.DataFrame: The preprocessed job data.
    """
    jobs_data['job_work_from_home'] = jobs_data['job_work_from_home'].fillna(True)
    jobs_data['salary_year_avg'] = jobs_data['salary_year_avg'].fillna(0)
    jobs_data['job_posted_date'] = pd.to_datetime(jobs_data['job_posted_date'], errors='coerce').dt.date
    jobs_data['job_title'] = jobs_data['job_title'].fillna('').astype(str)
    jobs_data['job_title_short'] = jobs_data['job_title_short'].fillna('').astype(str)

    # Define experience levels and their corresponding keywords
    experience_levels = {
        "Entry": ["entry level", "junior", "intern"],
        "Mid": ["mid level", "mid-level", "associate"],
        "Senior": ["senior", "lead", "principal", "manager"],
        "Director": ["director", "head"],
        "Executive": ["executive", "vp", "vice president", "cxo", "ceo", "cto", "cfo"]
    }

    def get_experience_level(title):
        """
        Determines the experience level based on the job title.

        Args:
            title (str): The job title.

        Returns:
            str: The experience level.
        """
        for level, keywords in experience_levels.items():
            for keyword in keywords:
                if keyword.lower() in title.lower():
                    return level
        return "Entry"

    jobs_data['experience_level'] = jobs_data['job_title'].apply(get_experience_level)

    exp_level_abbr = {
        "Entry": "E",
        "Mid": "M",
        "Senior": "S",
        "Director": "D",
        "Executive": "X"
    }

    def create_abbreviated_job_title(row):
        """
        Creates an abbreviated job title based on the experience level.

        Args:
            row (pandas.Series): The row of the job data.

        Returns:
            str: The abbreviated job title.
        """
        exp_level = row['experience_level']
        if exp_level in exp_level_abbr:
            return f"{row['job_title_short']} ({exp_level_abbr[exp_level]})"
        return row['job_title_short']

    jobs_data['abbreviated_job_title'] = jobs_data.apply(create_abbreviated_job_title, axis=1)

    text_columns = ['job_title_short', 'job_location', 'job_via', 'job_schedule_type',
                    'job_work_from_home', 'job_country', 'search_location', 'company_name', 'job_title']

    for col in text_columns:
        if jobs_data[col].dtype == 'object':
            jobs_data[col] = jobs_data[col].str.lower().fillna('')
        else:
            jobs_data[col] = jobs_data[col].astype(str).str.lower().fillna('')

    jobs_data['job_skills'] = jobs_data['job_skills'].apply(lambda x: json.loads(x.replace("'", '"')) if pd.notnull(x) else [])
    jobs_data['job_type_skills'] = jobs_data['job_type_skills'].apply(lambda x: json.loads(x.replace("'", '"')) if pd.notnull(x) else {})

    mlb = MultiLabelBinarizer()
    job_skills_encoded = mlb.fit_transform(jobs_data['job_skills'])
    job_skills_df = pd.DataFrame(job_skills_encoded, columns=mlb.classes_)
    jobs_data = pd.concat([jobs_data, job_skills_df], axis=1)

    skill_types = jobs_data['job_type_skills'].apply(lambda x: list(x.keys()))
    unique_types = list(set([item for sublist in skill_types for item in sublist]))
    for skill_type in unique_types:
        jobs_data[skill_type] = jobs_data['job_type_skills'].apply(lambda x: ','.join(x[skill_type]) if skill_type in x else '')

    for skill_type in unique_types:
        mlb = MultiLabelBinarizer()
        skills_encoded = mlb.fit_transform(jobs_data[skill_type].apply(lambda x: x.split(',')))
        skills_df = pd.DataFrame(skills_encoded, columns=[f"{skill_type}_{skill}" for skill in mlb.classes_])
        jobs_data = pd.concat([jobs_data, skills_df], axis=1)
        jobs_data = jobs_data.drop(skill_type, axis=1)

    return jobs_data

In [19]:
def generate_report(report, json_filename, html_filename):
    """
    Generate a report in JSON and HTML format based on the given data.

    Args:
        report (dict): The data for generating the report.
        json_filename (str): The filename for the JSON report.
        html_filename (str): The filename for the HTML report.
    """
    # Generate JSON report
    with open(json_filename, 'w') as f:
        json.dump(report, f, indent=4)

    def dict_to_html_table(data, title, sort_and_limit=False):
        def nested_dict_to_table(nested_data):
            """
            Converts a nested dictionary into an HTML table.

            Args:
                nested_data (dict): The nested dictionary to be converted.

            Returns:
                str: The HTML table representation of the nested dictionary.
            """
            if isinstance(nested_data, dict):
                headers = "".join([f"<th>{key}</th>" for key in nested_data.keys()])
                values = "".join([f"<td>{value}</td>" for value in nested_data.values()])
                return f"<table><thead><tr>{headers}</tr></thead><tbody><tr>{values}</tr></tbody></table>"
            else:
                return str(nested_data)

        if isinstance(data, dict):
            if sort_and_limit:
                # Sort the dictionary by value in descending order and take top 10
                sorted_data = dict(sorted(data.items(), key=lambda x: x[1], reverse=True)[:10])
            else:
                sorted_data = data
            
            headers = "<th>Key</th><th>Value</th>"
            rows = "".join([f"<tr><td>{key}</td><td>{nested_dict_to_table(value)}</td></tr>" for key, value in sorted_data.items()])
        elif isinstance(data, list):
            headers = "<th>Index</th><th>Value</th>"
            rows = "".join([f"<tr><td>{index}</td><td>{nested_dict_to_table(value)}</td></tr>" for index, value in enumerate(data)])
        else:
            return f"<p>{data}</p>"

        return f"""
        <div class="table-container">
            <h2>{title}</h2>
            <div class="table-wrapper">
                <table>
                    <thead><tr>{headers}</tr></thead>
                    <tbody>{rows}</tbody>
                </table>
            </div>
        </div>
        """

    # Generate HTML report
    html_content = f"""
    <html>
    <head>
        <title>Job Data Analysis Report</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                line-height: 1.6;
                padding: 20px;
                max-width: 1200px;
                margin: 0 auto;
            }}
            h1 {{ color: #333; }}
            h2 {{ color: #666; }}
            .table-container {{
                margin-bottom: 30px;
            }}
            .table-wrapper {{
                overflow-x: auto;
                max-height: 500px;
                overflow-y: auto;
            }}
            table {{
                border-collapse: collapse;
                width: 100%;
            }}
            th, td {{
                border: 1px solid #ddd;
                padding: 8px;
                text-align: left;
            }}
            thead {{
                position: sticky;
                top: 0;
                background-color: #f2f2f2;
            }}
            th {{
                background-color: #f2f2f2;
            }}
            tr:nth-child(even) {{
                background-color: #f9f9f9;
            }}
        </style>
    </head>
    <body>
        <h1>Job Data Analysis Report</h1>
        {dict_to_html_table(report['Descriptive Statistics'], 'Descriptive Statistics')}
        {dict_to_html_table(report['Average Salary by Location'], 'Average Salary by Location (Top 10)', sort_and_limit=True)}
        {dict_to_html_table(report['Job Counts by Company'], 'Job Counts by Company (Top 10)', sort_and_limit=True)}
        {dict_to_html_table(report['Experience Level Analysis'], 'Experience Level Analysis')}
    </body>
    </html>
    """
    with open(html_filename, 'w') as f:
        f.write(html_content)

In [20]:
def json_serializable(obj):
    """
    Helper function to make an object JSON serializable.

    Args:
        obj: The object to be made JSON serializable.

    Returns:
        The JSON serializable representation of the object.
    """
    if isinstance(obj, (np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    elif isinstance(obj, (np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, pd.Series):
        return obj.to_dict()
    elif isinstance(obj, pd.DataFrame):
        return obj.to_dict(orient='records')
    else:
        return str(obj)

def make_hashable(item):
    """
    Convert unhashable types within a list or dict to hashable types.

    Args:
        item: The item to be converted.

    Returns:
        The hashable representation of the item.
    """
    if isinstance(item, list):
        return tuple(make_hashable(x) for x in item)
    elif isinstance(item, dict):
        return frozenset((key, make_hashable(value)) for key, value in item.items())
    return item

def perform_data_quality_checks(df):
    """
    Perform automated data quality checks on the dataframe.

    Parameters:
    - df: pandas.DataFrame
        The input dataframe to perform data quality checks on.

    Returns:
    - dict
        A dictionary containing the results of the data quality checks. The dictionary has the following structure:
        {
            "missing_values": {},
            "data_types": {},
            "outliers": {},
            "inconsistencies": {},
            "value_ranges": {}
        }
        - "missing_values": dict
            A dictionary where the keys are column names and the values are the number of missing values in each column.
        - "data_types": dict
            A dictionary where the keys are column names and the values are the data types of each column.
        - "outliers": dict
            A dictionary where the keys are column names and the values are the number of outliers in each numerical column.
        - "inconsistencies": dict
            A dictionary where the keys are column names and the values are the inconsistencies found in each categorical column.
        - "value_ranges": dict
            A dictionary where the keys are column names and the values are dictionaries containing the minimum, maximum, mean, and median values of each numerical column.
    """
    report = {
        "missing_values": {},
        "data_types": {},
        "outliers": {},
        "inconsistencies": {},
        "value_ranges": {}
    }

    # Preprocess columns containing unhashable types
    for col in df.columns:
        if df[col].dtype == 'object':
            # Convert lists and dicts to hashable types
            df[col] = df[col].apply(lambda x: make_hashable(x) if isinstance(x, (list, dict)) else x)
    
    # Check for missing values
    missing_values = df.isnull().sum()
    report["missing_values"] = json_serializable(missing_values[missing_values > 0])

    # Check data types
    report["data_types"] = json_serializable(df.dtypes.astype(str))

    # Check for outliers in numerical columns
    numerical_columns = df.select_dtypes(include=[np.number]).columns
    for col in numerical_columns:
        z_scores = np.abs(stats.zscore(df[col].dropna()))
        outliers = df[col][z_scores > 3]
        if not outliers.empty:
            report["outliers"][col] = len(outliers)

    # Check for inconsistencies in categorical columns
    for col in df.columns:
        if df[col].dtype == 'object':
            unique_values = df[col].dropna().unique()
            unique_count = len(unique_values)
            
            if unique_count == 1:
                report["inconsistencies"][col] = f"Only one unique value: {unique_values[0]}"
            elif unique_count > 100:
                report["inconsistencies"][col] = f"High cardinality: {unique_count} unique values"

    # Check value ranges for numerical columns
    for col in numerical_columns:
        report["value_ranges"][col] = {
            "min": json_serializable(df[col].min()),
            "max": json_serializable(df[col].max()),
            "mean": json_serializable(df[col].mean()),
            "median": json_serializable(df[col].median())
        }

    return report

def generate_data_quality_report(report, filename):
    """
    Generate an HTML report for data quality checks with simple table formatting.
    """
    def dict_to_html_table(data):
        html = "<table border='1'><tr><th>Key</th><th>Value</th></tr>"
        for key, value in data.items():
            html += f"<tr><td>{key}</td><td>{value}</td></tr>"
        html += "</table>"
        return html

    html_content = f"""
    <html>
    <head>
        <title>Data Quality Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }}
            h1 {{ color: #333; }}
            h2 {{ color: #666; }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ text-align: left; padding: 8px; }}
            tr:nth-child(even) {{ background-color: #f2f2f2; }}
        </style>
    </head>
    <body>
        <h1>Data Quality Report</h1>
        <h2>Missing Values</h2>
        {dict_to_html_table(report['missing_values'])}
        <h2>Data Types</h2>
        {dict_to_html_table(report['data_types'])}
        <h2>Outliers</h2>
        {dict_to_html_table(report['outliers'])}
        <h2>Inconsistencies</h2>
        {dict_to_html_table(report['inconsistencies'])}
        <h2>Value Ranges</h2>
        {dict_to_html_table({k: str(v) for k, v in report['value_ranges'].items()})}
    </body>
    </html>
    """
    with open(filename, 'w') as f:
        f.write(html_content)

def create_data_quality_report_widget(jobs_data, results_folder):
    """
    Creates a widget for generating a data quality report.

    Parameters:
    - jobs_data (DataFrame): The input data for performing data quality checks.
    - results_folder (str): The folder path where the data quality report will be saved.

    Returns:
    - widget.VBox: The widget containing a button to generate the data quality report and an output area for displaying the report.
    """
    output = widgets.Output()

    def on_generate_report(b):
        with output:
            output.clear_output()
            print("Generating Data Quality Report...")
            data_quality_report = perform_data_quality_checks(jobs_data)
            filename_quality_report = os.path.join(results_folder, 'data_quality_report.html')
            generate_data_quality_report(data_quality_report, filename_quality_report)
            print("Data quality report generated as data_quality_report.html")
            display(HTML('<a href="data_quality_report.html" target="_blank">Open Data Quality Report</a>'))

    generate_button = widgets.Button(description="Generate Data Quality Report")
    generate_button.on_click(on_generate_report)

    return widgets.VBox([generate_button, output])


In [21]:
def parse_skills(skills_string):
    """
    Parses a string or list of skills and returns a list of stripped skills.

    Args:
        skills_string (str or list): The input string or list of skills.

    Returns:
        list: A list of stripped skills.

    Raises:
        None

    Examples:
        >>> parse_skills('Python, Java, C++')
        ['Python', 'Java', 'C++']

        >>> parse_skills(['Python', 'Java', 'C++'])
        ['Python', 'Java', 'C++']
    """
    # If the input is already a list, return it as is after stripping elements
    if isinstance(skills_string, list):
        return [str(skill).strip() for skill in skills_string if skill]
    
    try:
        # Try to parse the string as a list using literal_eval
        skills_list = ast.literal_eval(skills_string)
        return [str(skill).strip() for skill in skills_list if skill]
    except (ValueError, SyntaxError):
        # If parsing fails, treat it as a comma-separated string
        return [skill.strip() for skill in skills_string.split(',') if skill.strip()]

def job_recommendation_system(jobs_data, user_skills):
    """
    Recommends jobs based on user's skills.

    Args:
        jobs_data (DataFrame): A DataFrame containing job data.
        user_skills (list): A list of skills possessed by the user.

    Returns:
        DataFrame: A DataFrame containing the top 10 recommended jobs based on skill match.
    """
    jobs_data['skill_match'] = jobs_data['job_skills'].apply(lambda x: len(set(x) & set(user_skills)))
    recommended_jobs = jobs_data.sort_values('skill_match', ascending=False).head(10)
    return recommended_jobs

def perform_comprehensive_analysis(jobs_data):
    """
    Perform a comprehensive analysis on job market data.

    Args:
        jobs_data (DataFrame): The input job market data.

    Returns:
        tuple: A tuple containing the analysis report and the trained model.
            The analysis report is a dictionary with the following keys:
                - 'Descriptive Statistics': Descriptive statistics of the job market data.
                - 'Average Salary by Location': Average salary by job location.
                - 'Job Counts by Company': Number of jobs by company.
                - 'Feature Importance': Feature importance of the trained model.
                - 'Experience Level Analysis': Analysis of job data by experience level.
            The trained model is a RandomForestRegressor model trained on the input data.
    """
    # Function code here...
    pass
def perform_comprehensive_analysis(jobs_data):
    # Ensure job_skills are parsed
    jobs_data['job_skills'] = jobs_data['job_skills'].apply(parse_skills)

    # Descriptive statistics
    desc_stats = jobs_data.describe()
    avg_salary_by_location = jobs_data.groupby('job_location')['salary_year_avg'].mean()
    job_counts_by_company = jobs_data['company_name'].value_counts()

    features = ['job_location', 'job_schedule_type', 'job_work_from_home', 'job_country', 'job_posted_date', 'experience_level']
    X = jobs_data[features]
    y = jobs_data['salary_year_avg']

    label_encoders = {}
    for col in X.select_dtypes(include=['object', 'datetime64']).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    # Feature Importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Experience Level-Based Job Analysis
    exp_level_analysis = jobs_data.groupby('experience_level').agg({
        'salary_year_avg': ['mean', 'std'],
        'job_title': 'count'
    }).reset_index()
    exp_level_analysis.columns = ['experience_level', 'avg_salary', 'salary_std', 'job_count']

    report = {
        'Descriptive Statistics': desc_stats.to_dict(),
        'Average Salary by Location': avg_salary_by_location.to_dict(),
        'Job Counts by Company': job_counts_by_company.to_dict(),
        'Feature Importance': feature_importance.to_dict('records'),
        'Experience Level Analysis': exp_level_analysis.to_dict('records')
    }

    return report, model

def create_interactive_widgets(jobs_data, model):
    """
    Creates interactive widgets for job recommendation and salary prediction.

    Args:
        jobs_data (pandas.DataFrame): DataFrame containing job data.
        model: Trained machine learning model for salary prediction.

    Returns:
        output (widgets.Output): Output widget to display recommendations and predictions.
    """
    # Job recommendation widgets
    all_skills = set()
    for skills in jobs_data['job_skills']:
        all_skills.update(skills)
    all_skills = {skill for skill in all_skills if isinstance(skill, str) and skill.isalpha() and len(skill) > 1}

    skills_dropdown = widgets.SelectMultiple(
        options=sorted(all_skills),
        description='Skills:',
        disabled=False
    )

    recommend_button = widgets.Button(description="Get Recommendations")

    # Experience level dropdown for salary prediction
    experience_levels = jobs_data['experience_level'].unique()
    exp_level_dropdown = widgets.Dropdown(
        options=experience_levels,
        description='Experience Level:',
        disabled=False
    )

    predict_salary_button = widgets.Button(description="Predict Salary")

    output = widgets.Output()

    def on_recommend_clicked(b):
        with output:
            output.clear_output()
            user_skills = list(skills_dropdown.value)
            if not user_skills:
                print("Please select at least one skill.")
                return

            recommended_jobs = job_recommendation_system(jobs_data, user_skills)
            display(HTML(recommended_jobs[['job_title_short', 'job_location', 'job_via']].to_html(index=False)))

    def on_predict_salary_clicked(b):
        with output:
            output.clear_output()
            exp_level = exp_level_dropdown.value
            if not exp_level:
                print("Please select an experience level.")
                return

            # Get average values for other features
            avg_features = jobs_data[['job_location', 'job_schedule_type', 'job_work_from_home', 'job_country']].mode().iloc[0]

            # Prepare the input for prediction
            input_data = pd.DataFrame({
                'job_location': [avg_features['job_location']],
                'job_schedule_type': [avg_features['job_schedule_type']],
                'job_work_from_home': [avg_features['job_work_from_home']],
                'job_country': [avg_features['job_country']],
                'job_posted_date': [jobs_data['job_posted_date'].max()],
                'experience_level': [exp_level]
            })

            # Transform categorical variables
            for col in input_data.select_dtypes(include='object').columns:
                le = LabelEncoder()
                le.fit(jobs_data[col])  # Fit on the entire dataset
                input_data[col] = le.transform(input_data[col])

            # Make prediction
            predicted_salary = model.predict(input_data)[0]
            
            # Calculate confidence interval (assuming normal distribution)
            confidence_interval = 1.96 * 7053.83  # 95% confidence interval

            lower_bound = max(0, predicted_salary - confidence_interval)
            upper_bound = predicted_salary + confidence_interval

            print(f"Predicted salary for {exp_level} experience level:")
            print(f"${predicted_salary:.2f} (95% CI: ${lower_bound:.2f} - ${upper_bound:.2f})")

            # Show average salary from the dataset for comparison
            avg_salary = jobs_data[jobs_data['experience_level'] == exp_level]['salary_year_avg'].mean()
            print(f"Average salary in dataset for {exp_level} experience level: ${avg_salary:.2f}")

    recommend_button.on_click(on_recommend_clicked)
    predict_salary_button.on_click(on_predict_salary_clicked)

    display(widgets.VBox([
        widgets.HBox([skills_dropdown, recommend_button]),
        widgets.HBox([exp_level_dropdown, predict_salary_button]),
        output
    ]))

    return output

def display_full_report(report, output):
    """
    Display the full job data analysis report.

    Args:
        report (dict): The job data analysis report containing various sections.
        output (IPython.display.OutputWidget): The output widget to display the report.

    Returns:
        None
    """
    with output:
        output.clear_output()
        
        html_content = """
        <h1>Job Data Analysis Report</h1>
        
        <h2>Descriptive Statistics</h2>
        {desc_stats}
        
        <h2>Average Salary by Location (Top 10)</h2>
        {avg_salary}
        
        <h2>Job Counts by Company (Top 10)</h2>
        {job_counts}
        
        <h2>Feature Importance</h2>
        {feature_importance}
        
        <h2>Experience Level Analysis</h2>
        {exp_analysis}
        """
        
        # Convert each section to HTML
        desc_stats_html = pd.DataFrame(report['Descriptive Statistics']).to_html()
        avg_salary_html = pd.Series(report['Average Salary by Location']).sort_values(ascending=False).head(10).to_frame().to_html()
        job_counts_html = pd.Series(report['Job Counts by Company']).sort_values(ascending=False).head(10).to_frame().to_html()
        feature_importance_html = pd.DataFrame(report['Feature Importance']).to_html()
        exp_analysis_html = pd.DataFrame(report['Experience Level Analysis']).to_html()
        
        # Format the HTML content
        formatted_html = html_content.format(
            desc_stats=desc_stats_html,
            avg_salary=avg_salary_html,
            job_counts=job_counts_html,
            feature_importance=feature_importance_html,
            exp_analysis=exp_analysis_html
        )
        
        display(HTML(formatted_html))

In [23]:
# Main execution block
if __name__ == "__main__":
    results_folder = "results"
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    # Load and preprocess data (you need to implement this part)
    url = "https://huggingface.co/datasets/lukebarousse/data_jobs/resolve/main/data_jobs.csv?download=true"
        # Filepath to save the downloaded data
    filename = os.path.join(results_folder, "data_jobs.csv")

    # Check if data file exists, if not, download it
    if not os.path.exists(filename):
        print(f"Downloading data from {url}...")
        print(f"Saving data to {filename}...")
        load_data_from_url(url, filename)

    # Check if preprocessed data file exists
    preprocessed_filename = os.path.join(results_folder, "data_jobs_preprocessed_analysis.csv")
    if os.path.exists(preprocessed_filename):
        print(f"Preprocessed data found as {preprocessed_filename}. Loading...")
        jobs_data_preprocessed = pd.read_csv(preprocessed_filename)
    else:
        print(preprocessed_filename)
        # Extract and preprocess data
        jobs_data_filtered = extract_data(filename)
        jobs_data_preprocessed = preprocess_data(jobs_data_filtered)
        jobs_data_preprocessed.to_csv(preprocessed_filename, index=False)
        print(f"Preprocessed data saved as {preprocessed_filename}")

    jobs_data_sampled = jobs_data_preprocessed.sample(n=20000, random_state=42)
    # Perform data quality checks
    data_quality_widget = create_data_quality_report_widget(jobs_data_sampled, results_folder)
    display(data_quality_widget)
    report, model = perform_comprehensive_analysis(jobs_data_sampled)
    print("Interactive job analysis system is ready. Please use the dropdowns to select skills and experience level.")
    # Create and display interactive widgets
    output = create_interactive_widgets(jobs_data_sampled, model)
    # Create and display the full report button
    full_report_button = widgets.Button(description="Show Full Report")
    full_report_button.on_click(lambda b: display_full_report(report, output))
    display(full_report_button)
    json_filename = os.path.join(results_folder,'job_data_analysis_report.json')
    html_filename = os.path.join(results_folder,'job_data_analysis_report.html')
    generate_report(report, json_filename, html_filename)

Preprocessed data found as results/data_jobs_preprocessed_analysis.csv. Loading...


VBox(children=(Button(description='Generate Data Quality Report', style=ButtonStyle()), Output()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


Interactive job analysis system is ready. Please use the dropdowns to select skills and experience level.


VBox(children=(HBox(children=(SelectMultiple(description='Skills:', options=('airflow', 'airtable', 'alteryx',…

Button(description='Show Full Report', style=ButtonStyle())