In [5]:
import sqlite3
import pandas as pd
import numpy as np

In [6]:
# Create a connection to SQLite database (or create a new one)
conn = sqlite3.connect('transactional_data.db')
cursor = conn.cursor()

# Create a transactional table
cursor.execute('''
CREATE TABLE IF NOT EXISTS transactions (
    transaction_id INTEGER PRIMARY KEY,
    transaction_date TEXT,
    product_id INTEGER,
    quantity INTEGER,
    price REAL
)
''')

# Insert some sample data into the transactional table
np.random.seed(42)
transaction_data = [
    (i, 
     f"2024-08-{np.random.randint(1, 31):02d}", 
     np.random.randint(1, 11), 
     np.random.randint(1, 10), 
     round(np.random.uniform(5.0, 100.0), 2)
    )
    for i in range(1, 101)
]

cursor.executemany('''
INSERT INTO transactions (transaction_id, transaction_date, product_id, quantity, price)
VALUES (?, ?, ?, ?, ?)
''', transaction_data)

conn.commit()

print("SQLite Transactional table created and populated with sample data.")

IntegrityError: UNIQUE constraint failed: transactions.transaction_id

In [None]:
# Create a DataFrame with product_id and product_category
product_category_data = {
    'product_id': range(1, 11),
    'product_category': ['Electronics', 'Clothing', 'Groceries', 'Furniture', 'Toys', 
                         'Books', 'Sports', 'Beauty', 'Automotive', 'Jewelry']
}

product_category_df = pd.DataFrame(product_category_data)

# Save the DataFrame to a CSV file
product_category_df.to_csv('product_categories.csv', index=False)

print("CSV file with product categories created.")

CSV file with product categories created.


In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Create a synthetic dataset with 10,000 rows and 50 features
X, y = make_classification(n_samples=10000, 
                           n_features=50, 
                           n_informative=10, 
                           n_redundant=10, 
                           n_repeated=5, 
                           n_classes=2, 
                           n_clusters_per_class=2, 
                           weights=[0.5, 0.5],
                           flip_y=0.01,
                           class_sep=1.0,
                           random_state=42)

# Convert to DataFrame
columns = [f'feature_{i}' for i in range(50)]
df = pd.DataFrame(X, columns=columns)

# Introduce some correlated features
df['feature_corr_1'] = df['feature_0'] + df['feature_1'] + np.random.normal(0, 0.1, size=df.shape[0])
df['feature_corr_2'] = df['feature_2'] - df['feature_3'] + np.random.normal(0, 0.1, size=df.shape[0])

# Add the target variable to the DataFrame
df['target'] = y

# Save the dataset as a CSV file
df.to_csv('synthetic_dataset.csv', index=False)

print("Dataset created and saved as 'synthetic_dataset.csv'.")

Dataset created and saved as 'synthetic_dataset.csv'.


In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 1000

# Generate a normally distributed feature
feature_1 = np.random.normal(loc=50, scale=10, size=n_samples)

# Generate 3 features that are correlated with feature_1
feature_2 = 0.5 * feature_1 + np.random.normal(loc=0, scale=5, size=n_samples)  # Correlated positively
feature_3 = 2.0 * feature_1 + np.random.normal(loc=0, scale=10, size=n_samples)  # More strongly correlated
feature_4 = -1.0 * feature_1 + np.random.normal(loc=0, scale=5, size=n_samples)  # Negatively correlated

# Introduce missing values in feature_1 (approximately 20% missing)
missing_indices = np.random.choice(n_samples, size=int(n_samples * 0.2), replace=False)
feature_1[missing_indices] = np.nan

# Create a DataFrame
df = pd.DataFrame({
    'feature_1': feature_1,
    'feature_2': feature_2,
    'feature_3': feature_3,
    'feature_4': feature_4
})

# Display the first few rows of the DataFrame
print("First 5 rows of the dataset with missing values in 'feature_1':")
print(df.head())

# Save the dataset to a CSV file
df.to_csv('pre-ml_dataset.csv', index=False)

print("\nDataset created and saved as 'ml_dataset.csv'.")

First 5 rows of the dataset with missing values in 'feature_1':
   feature_1  feature_2   feature_3  feature_4
0  54.967142  34.480348  103.182500 -64.506179
1  48.617357  28.931847   95.789527 -52.919282
2  56.476885  28.536595  105.029572 -58.544913
3  65.230299  29.380465  127.380982 -55.791860
4  47.658466  27.320350   76.380786 -44.875701

Dataset created and saved as 'ml_dataset.csv'.
