In [1]:
# Create the package structure and write files

import os

# Name of the package; it is also the directory that will hold the modules.
package_name = "my_data_science_package"

# Ensure the package and test directories exist (re-running is harmless
# because exist_ok=True suppresses the "already exists" error).
for directory in (f"{package_name}/", "tests/"):
    os.makedirs(directory, exist_ok=True)

# Source text of every module in the package, keyed by file name.
# Each value is written to disk verbatim, so the docstrings inside the
# literals escape their triple quotes to avoid closing the outer string.
modules = {
    "data_processing.py": """
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

def summarize_dataframe(sdf):
    \"\"\"
    Summarizes the structure and content of a DataFrame.
    \"\"\"
    return pd.DataFrame({
        'Column': sdf.columns,
        'Non-Null Count': sdf.notnull().sum(),
        'Data Type': sdf.dtypes,
        'Unique Values': sdf.nunique(),
        'Missing Values': sdf.isnull().sum(),
        'Missing Values %': ((sdf.isnull().sum()*100)/len(sdf)).round(2)
    }).reset_index(drop=True)

def impute_missing_values(df):
    \"\"\"
    Imputes missing values using KNN and Iterative Imputer.

    The categorical columns 'loan_request_reason' and 'applicant_job_type'
    (when present) are label-encoded with their missing entries kept as NaN,
    so that a missing value is not turned into a category of its own.
    Continuous numeric columns are then filled with IterativeImputer, and
    the encoded categorical columns with KNNImputer, rounded to the nearest
    existing label code.

    Only cells that were missing are overwritten.  Non-numeric columns other
    than the two listed above, and columns without a single observed value,
    are returned unchanged.  KNN distances are computed on unscaled
    features, so scale the data beforehand if column ranges differ widely.

    Returns a new DataFrame; the input frame is not modified.
    \"\"\"
    df_imputed = df.copy()
    categorical_columns = [
        col for col in ('loan_request_reason', 'applicant_job_type')
        if col in df_imputed.columns
    ]

    # Encode observed categories only: calling astype(str) on the whole
    # column would turn NaN into the literal string 'nan' and hide it from
    # the imputers below.
    for col in categorical_columns:
        observed = df_imputed[col].notna().to_numpy()
        codes = np.full(len(df_imputed), np.nan)
        if observed.any():
            codes[observed] = LabelEncoder().fit_transform(
                df_imputed[col].to_numpy()[observed].astype(str)
            )
        df_imputed[col] = codes

    numeric_columns = df_imputed.select_dtypes(include='number').columns.tolist()
    # scikit-learn drops features that have no observed value at all, which
    # would misalign the output columns, so such columns are skipped.
    fillable = [col for col in numeric_columns if df_imputed[col].notna().any()]
    continuous = [col for col in fillable if col not in categorical_columns]
    encoded = [col for col in fillable if col in categorical_columns]

    if continuous and df_imputed[continuous].isna().any().any():
        iterative_imputer = IterativeImputer(max_iter=10, random_state=42)
        filled = iterative_imputer.fit_transform(df_imputed[continuous])
        for position, col in enumerate(continuous):
            missing = df_imputed[col].isna().to_numpy()
            if missing.any():
                df_imputed.loc[missing, col] = filled[missing, position]

    if encoded and df_imputed[encoded].isna().any().any():
        knn_imputer = KNNImputer(n_neighbors=5)
        filled = knn_imputer.fit_transform(df_imputed[fillable])
        for col in encoded:
            missing = df_imputed[col].isna().to_numpy()
            if missing.any():
                # Neighbour averages are fractional; snap to a real code.
                df_imputed.loc[missing, col] = np.rint(
                    filled[missing, fillable.index(col)]
                )

    # Label codes are integers; restore that dtype once no gaps remain.
    for col in categorical_columns:
        if df_imputed[col].notna().all():
            df_imputed[col] = df_imputed[col].astype('int64')
    return df_imputed
""",
    "data_visualization.py": """
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def histogram_boxplot(data, feature, figsize=(12, 7), kde=True, bins=None):
    \"\"\"
    Boxplot and histogram combined for a feature.

    bins is forwarded to seaborn only when it is given; leaving it as None
    keeps seaborn's automatic bin selection.
    \"\"\"
    f, (ax_box, ax_hist) = plt.subplots(
        nrows=2, sharex=True, gridspec_kw={"height_ratios": (0.25, 0.75)}, figsize=figsize
    )
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")
    # None is not a valid bin specification for seaborn/NumPy, so the
    # argument is omitted unless the caller supplied one.  No palette is
    # passed because it has no effect without a hue variable.
    hist_kwargs = {} if bins is None else {"bins": bins}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist, **hist_kwargs)
    plt.show()

def plot_correlation_heatmap(data, title):
    \"\"\"
    Plots a correlation heatmap of the numeric columns of the dataset.
    \"\"\"
    plt.figure(figsize=(12, 10))
    # Correlation is only defined for numeric data; recent pandas versions
    # raise instead of silently skipping text columns.
    numeric_data = data.select_dtypes(include=["number", "bool"])
    sns.heatmap(numeric_data.corr(), annot=True, fmt=".2f", cmap="coolwarm")
    plt.title(title, fontsize=16)
    plt.show()
""",
    "model_evaluation.py": """
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def metrics_score(actual, predicted, labels=('Not Converted', 'Converted')):
    \"\"\"
    Calculates and displays classification metrics and confusion matrix.

    labels are the tick labels of the confusion matrix, in class order;
    pass one label per class when the target is not the default binary
    'Not Converted' / 'Converted' problem.
    \"\"\"
    print(classification_report(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    tick_labels = list(labels)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
""",
    "utils.py": """
import pandas as pd

def segregate_columns_by_dtype(df):
    \"\"\"
    Segregates DataFrame columns by their data types.
    \"\"\"
    return {dtype.name: df.select_dtypes(include=[dtype]).columns.tolist() for dtype in df.dtypes.unique()}
""",
}

# Emit each module's source text into the package directory.
for filename, source in modules.items():
    module_path = f"{package_name}/{filename}"
    with open(module_path, "w") as handle:
        handle.write(source)

# An empty __init__.py marks the directory as an importable package;
# opening it for writing is enough to create (or truncate) the file.
with open(f"{package_name}/__init__.py", "w"):
    pass

# Packaging metadata for setuptools; the text below is written to setup.py
# exactly as it appears here.
setup_content = """
from setuptools import setup, find_packages

setup(
    name='my_data_science_package',
    version='0.1.0',
    description='A collection of data science tools.',
    author='Your Name',
    author_email='your.email@example.com',
    packages=find_packages(),
    install_requires=[
        'pandas>=1.0',
        'numpy>=1.20',
        'matplotlib>=3.0',
        'seaborn>=0.11',
        'scikit-learn>=0.24',
    ],
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.7',
)
"""

# Place setup.py next to the package directory, in the working directory.
with open("setup.py", "w") as handle:
    handle.write(setup_content)

# Give the project a minimal README (a title plus a one-line description).
readme_text = "# My Data Science Package\n\nA collection of utilities for data science workflows."
with open("README.md", "w") as handle:
    handle.write(readme_text)

# One placeholder test module per package module, named test_<module>.py.
test_files = [
    f"test_{stem}.py"
    for stem in ("data_processing", "data_visualization", "model_evaluation", "utils")
]
for test_file in test_files:
    placeholder_path = f"tests/{test_file}"
    with open(placeholder_path, "w") as handle:
        handle.write("# Write your tests here\n")

# Report completion once every file has been written.
print(f"Package {package_name} structure created successfully.")


Package my_data_science_package structure created successfully.
