In [None]:
import pandas as pd
import re

# Sample data: date strings whose day-of-month carries an ordinal suffix
df = pd.DataFrame({
    'raw_date': ['Mon, 28th May 2020 12:34:56 GMT', 'Tue, 1st Jan 2019 05:00:00 GMT']
})

# Strip the ordinal suffix ('st', 'nd', 'rd', 'th') that directly follows the day digits.
# The vectorised .str accessor leaves missing values as missing instead of raising
# (re.sub inside .apply fails with TypeError on NaN), and the trailing \b makes sure
# only a complete suffix is removed.
df['cleaned_date'] = df['raw_date'].str.replace(
    r'(\d+)(?:st|nd|rd|th)\b', r'\1', regex=True
)

# Parse into datetime format; rows that do not match the format become NaT
df['parsed_date'] = pd.to_datetime(df['cleaned_date'], format='%a, %d %b %Y %H:%M:%S %Z', errors='coerce')

# Optional: just keep the date part (no time)
df['just_date'] = df['parsed_date'].dt.date

print(df[['raw_date', 'just_date']])


In [None]:
import matplotlib.pyplot as plt

# Inputs: one price per year for each of the three cars
years = ['2010', '2025']
car1_prices = [15000, 18000]
car2_prices = [20000, 25000]
car3_prices = [17000, 22000]

# Car 3 sits on top of car 1 + car 2, so its baseline is their per-year sum
car1_plus_car2 = [sum(pair) for pair in zip(car1_prices, car2_prices)]

# Draw the three layers from the bottom up on a single axes
fig, ax = plt.subplots()
bar1 = ax.bar(years, car1_prices, label='Car 1')
bar2 = ax.bar(years, car2_prices, bottom=car1_prices, label='Car 2')
bar3 = ax.bar(years, car3_prices, bottom=car1_plus_car2, label='Car 3')

# Axis label, title and legend
ax.set(ylabel='Total Price ($)', title='Stacked Car Prices in 2010 vs 2025')
ax.legend()

fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import shap
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `model` and `X` are not created in this cell; they must already exist
# in the kernel (presumably a fitted tree model and its feature DataFrame — confirm).

# Get SHAP interaction values
explainer = shap.TreeExplainer(model)
shap_interaction_values = explainer.shap_interaction_values(X)  # shape: [n_samples, n_features, n_features]

# Define feature of interest
feature_name = 'A'
feature_index = list(X.columns).index(feature_name)

# Mean absolute interaction value between 'A' and every feature (averaged across samples)
interaction_strengths = np.abs(shap_interaction_values[:, feature_index, :]).mean(axis=0)

# Create DataFrame for plotting, strongest interaction first
interaction_df = pd.DataFrame({
    'feature': X.columns,
    'interaction_strength': interaction_strengths
}).sort_values(by='interaction_strength', ascending=False)

# Optional: Drop self-interaction if you only want cross-feature effects
interaction_df = interaction_df[interaction_df.feature != feature_name]

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=interaction_df, x='interaction_strength', y='feature')
plt.title(f"Mean SHAP Interaction Strengths with '{feature_name}'")
plt.xlabel("Mean |SHAP Interaction Value|")
plt.ylabel("Interacting Feature")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Rebuild llama-cpp-python from source with the CUDA, Metal, CLBlast and BLAS options off.
# A bare shell command is not valid in a Python code cell, so use IPython magics:
# %env exports CMAKE_ARGS to the kernel process (it stays set for the rest of the session)
# and %pip installs into the environment the running kernel uses.
%env CMAKE_ARGS=-DLLAMA_CUDA=OFF -DLLAMA_METAL=OFF -DLLAMA_CLBLAST=OFF -DLLAMA_BLAS=OFF
%pip install --force-reinstall --no-cache-dir --no-binary llama-cpp-python llama-cpp-python

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# --- Step 1: Simulate weekly purchase data ---
np.random.seed(0)
weeks = pd.date_range("2023-01-01", periods=52, freq='W')

# Start and end level of each state's linear trend; insertion order fixes both the
# column order and the order in which random noise is drawn.
trend_endpoints = {
    'Texas': (100, 200),
    'Oklahoma': (90, 180),
    'Louisiana': (105, 205),
    'California': (300, 400),
    'New York': (200, 100),
}
states = list(trend_endpoints)

def make_trend(base, noise=5):
    """Return `base` plus zero-mean Gaussian noise.

    Parameters
    ----------
    base : array-like or scalar
        Noise-free trend values.
    noise : float, default 5
        Standard deviation of the added noise.

    Returns
    -------
    `base` with one noise sample added per element. The noise takes its shape from
    `base`; a scalar `base` falls back to one value per entry of the module-level
    `weeks`, as before.
    """
    size = np.shape(base) or len(weeks)
    return base + np.random.normal(0, noise, size=size)

data = {
    state: make_trend(np.linspace(start, end, len(weeks)))
    for state, (start, end) in trend_endpoints.items()
}

df = pd.DataFrame(data, index=weeks)

# --- Step 2: Normalize trends (Z-score) ---
# Each state's column is standardised independently
df_normalized = df.apply(zscore)

# --- Step 3: Hyper-tune K using silhouette score ---
X = df_normalized.T  # states as rows

# The upper bound of the range keeps k at most len(X) - 1 (and at most 9)
k_candidates = range(2, min(len(X), 10))
if len(k_candidates) == 0:
    raise ValueError(
        f"Need at least 3 rows to compare cluster counts by silhouette score, got {len(X)}"
    )

sil_scores = {}
for k in k_candidates:
    # n_init is pinned so results do not depend on the scikit-learn version's default
    model = KMeans(n_clusters=k, n_init=10, random_state=0)
    labels = model.fit_predict(X)
    score = silhouette_score(X, labels)
    sil_scores[k] = score

best_k = max(sil_scores, key=sil_scores.get)
print(f"Best number of clusters: {best_k} with silhouette score: {sil_scores[best_k]:.3f}")

# --- Step 4: Fit final model and visualize ---
final_model = KMeans(n_clusters=best_k, n_init=10, random_state=0)
labels = final_model.fit_predict(X)
cluster_df = pd.DataFrame({'state': X.index, 'cluster': labels})
print(cluster_df)

# --- Plot silhouette scores ---
# One point per candidate k, with the winning k marked by a dashed vertical line
tested_ks = list(sil_scores)
plt.figure(figsize=(8, 5))
plt.plot(tested_ks, [sil_scores[k_value] for k_value in tested_ks], marker='o')
plt.axvline(x=best_k, linestyle='--', color='red', label=f'Best k = {best_k}')
plt.title("Silhouette Score vs. Number of Clusters")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.xticks(range(2, min(len(X), 10)))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


# --- Step 5: Plot each cluster's normalized time trends ---
# sort=False walks the clusters in order of first appearance in cluster_df
for cluster, cluster_rows in cluster_df.groupby('cluster', sort=False):
    cluster_states = cluster_rows['state']
    df_normalized[cluster_states].plot(title=f"Cluster {cluster} Trends")
    plt.ylabel("Z-Score Normalized Purchases")
    plt.show()

# --- Step 6: Correlation heatmap ---
# Pairwise correlation between the raw (un-normalized) state columns
corr = df.corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title("Raw Correlation of Purchase Trends")
plt.show()

# --- Step 7: Intra-cluster correlation check ---
def mean_cluster_corr(corr_matrix, cluster_df):
    """Average pairwise correlation between the members of each cluster.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix whose index and columns are the member labels.
    cluster_df : pandas.DataFrame
        Frame with a 'state' column (member label) and a 'cluster' column.

    Returns
    -------
    dict
        Maps "Cluster <id> mean corr" to the mean of the correlations above the
        diagonal of that cluster's sub-matrix. A cluster with a single member has
        no pairs and maps to NaN.
    """
    results = {}
    for cluster in cluster_df['cluster'].unique():
        members = cluster_df[cluster_df['cluster'] == cluster]['state']
        intra = corr_matrix.loc[members, members].values
        # k=1 skips the diagonal, so each distinct pair is counted exactly once
        intra_corr = intra[np.triu_indices_from(intra, k=1)]
        # Handle the no-pairs case explicitly: np.mean of an empty array warns
        results[f"Cluster {cluster} mean corr"] = (
            np.mean(intra_corr) if intra_corr.size else np.nan
        )
    return results

# Report the mean intra-cluster correlation for every cluster
intra_cluster_corr = mean_cluster_corr(corr, cluster_df)
print(intra_cluster_corr)


In [None]:
# Make sure both date columns hold datetimes before they are compared
for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Flag: 1 when end_date is strictly later than start_date, otherwise 0
ends_after_start = df['end_date'].gt(df['start_date'])
df['date_after_flag'] = ends_after_start.astype(int)

# Flag: 1 when 'comments' contains 'hi' in any letter case, otherwise 0 (missing -> 0)
mentions_hi = df['comments'].str.contains('hi', case=False, na=False)
df['contains_hi_flag'] = mentions_hi.astype(int)

# Order rows by email, then by date, so each email's earliest row comes first
df = df.sort_values(by=['email', 'date'])

# One row per email: the first one after sorting, i.e. the earliest date
df_first_date = df.drop_duplicates(subset=['email'])

# Row masks: campaign contains 'saiid' (any case) vs. everything else
mask_saiid = df['campaign'].str.contains('saiid', case=False, na=False)
mask_not_saiid = ~mask_saiid

# Distinct emails on each side of the mask
emails_saiid = set(df['email'][mask_saiid])
emails_not_saiid = set(df['email'][mask_not_saiid])

# Emails seen on both sides, and emails seen on exactly one side
emails_both = emails_saiid.intersection(emails_not_saiid)
emails_only_saiid = emails_saiid.difference(emails_not_saiid)
emails_only_not_saiid = emails_not_saiid.difference(emails_saiid)

# Report the size of each group
for label, email_group in (
    ("Count in both:", emails_both),
    ("Count only in saiid:", emails_only_saiid),
    ("Count only in non-saiid:", emails_only_not_saiid),
):
    print(label, len(email_group))

# final_flag is 1 when either branch holds:
#   one_column == 1 and min_date is later than application_date
#   one_column == 0 and application_date is later than fixed_date
# NOTE(review): fixed_date is not assigned in this cell — it must already exist in the kernel.
branch_one = (df['one_column'] == 1) & (df['min_date'] > df['application_date'])
branch_zero = (df['one_column'] == 0) & (df['application_date'] > fixed_date)
df['final_flag'] = (branch_one | branch_zero).astype(int)

In [None]:
campaign_substr = 'hi'  # the substring to search for

# 1) Which rows are in a matching campaign?
#    - regex=False treats campaign_substr as literal text, not a regular expression.
#    - astype(str) turns a missing campaign into the text 'nan', so missing rows are
#      excluded explicitly; otherwise a substring such as 'na' would match them.
mask = (
    df['campaign'].notna()
    & df['campaign'].astype(str).str.contains(campaign_substr, case=False, na=False, regex=False)
)

# 2) All emails that EVER had a matching campaign
emails_with_match = set(df.loc[mask, 'email'])

# 3) Flag every row for those emails (1 if the email ever used the campaign, else 0)
df['ever_hi'] = df['email'].isin(emails_with_match).astype(int)

# 4) First date each email was associated with a matching campaign
#    (a Series indexed by email, holding the minimum 'date' among matching rows)
first_hi_per_email = df.loc[mask].groupby('email')['date'].min()

# Map to every row; emails without a match get a missing value
df['first_hi_date'] = df['email'].map(first_hi_per_email)

# (optional) If you'd rather fill non-matching emails with a sentinel string:
# df['first_hi_date'] = df['first_hi_date'].dt.strftime('%Y-%m-%d').fillna('never')

In [None]:
# Mean of 'actual' for each distinct 'score' value
avg_sales = df.groupby("score")["actual"].mean().reset_index()

# Plot
plt.figure(figsize=(7,5))
plt.plot(avg_sales["score"], avg_sales["actual"], marker="o", linewidth=2)

# Style
plt.title("Average Sales Rate by Model Score", fontsize=14)
plt.xlabel("Model Score (1 = high sale likelihood, 5 = low)", fontsize=12)
plt.ylabel("Average Sales Rate", fontsize=12)
plt.grid(True, alpha=0.3)
plt.xticks(avg_sales["score"])  # one tick per score that is present in the data

# Annotate values: percentage label placed 0.01 (in data units) above each point
for x, y in zip(avg_sales["score"], avg_sales["actual"]):
    plt.text(x, y + 0.01, f"{y:.2%}", ha="center", fontsize=10)

# Finalise and render the figure, as the other plotting cells in this notebook do
plt.tight_layout()
plt.show()


In [None]:
target_prefixes = ("1", "2", "3")  # numbers you want to match at the start

# NOTE(review): json_files, s3, bucket_name, json and all_data are not created in this
# cell; they must already exist in the kernel before it runs.
for json_file in json_files:
    try:
        # Download the object and decode it as UTF-8 text
        obj = s3.get_object(Bucket=bucket_name, Key=json_file)
        file_content = obj['Body'].read().decode('utf-8')

        try:
            json_data = json.loads(file_content)
        except json.JSONDecodeError:
            print(f"File {json_file} is not valid JSON. Skipping.")
            continue

        # Normalise to a list of records: a single object becomes a one-element list
        if isinstance(json_data, dict):
            json_data = [json_data]
        elif not isinstance(json_data, list):
            print(f"Unexpected JSON format in {json_file}. Skipping.")
            continue

        # Keep the objects that have at least one value whose text form starts with a
        # target prefix. Non-dict elements are skipped individually; calling .values()
        # on one would raise and throw away the whole file.
        filtered_data = [
            item for item in json_data
            if isinstance(item, dict)
            and any(str(v).startswith(target_prefixes) for v in item.values())
        ]

        all_data.extend(filtered_data)

    except Exception as e:
        # Best effort: report the failure and move on to the next file
        print(f"Error processing file {json_file}: {e}")
        continue


In [None]:
import boto3
import ijson
import orjson
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from botocore.config import Config
from io import BytesIO

# ==== CONFIG ====
bucket_name = 'your-bucket-name'
prefix = 'your-folder-name/'       # keep the trailing slash
target_prefixes = ("1", "2", "3")  # a value matches when it starts with one of these
field_name = None                  # e.g. "id" to test only that field; None tests every field
max_workers = 16                   # worker threads; tune for your environment / network
max_pool_connections = 64          # HTTP connection pool size used by the S3 client
# ===============

# S3 client with the enlarged connection pool and up to 10 attempts in 'standard' retry mode
cfg = Config(
    retries={'max_attempts': 10, 'mode': 'standard'},
    max_pool_connections=max_pool_connections,
)
s3 = boto3.client('s3', config=cfg)

# Walk every result page under the prefix and keep the keys that end in '.json'
paginator = s3.get_paginator('list_objects_v2')
page_it = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

json_keys = [
    entry['Key']
    for page in page_it
    for entry in page.get('Contents', [])
    if entry['Key'].endswith('.json')
]

def obj_matches(item: dict) -> bool:
    """Return True if the item has a text value starting with one of target_prefixes.

    When the module-level `field_name` is set, only that field is checked. Otherwise
    every top-level value is checked, and values that are lists, tuples or dicts are
    searched one level deep. Only str and bytes values can match; other types
    (including numbers) never do.
    """
    def starts(v):
        # Decode bytes first: str(b'1..') is "b'1..'", which can never match a prefix
        if isinstance(v, bytes):
            v = v.decode('utf-8', errors='replace')
        return isinstance(v, str) and v.startswith(target_prefixes)

    if field_name:
        return starts(item.get(field_name, None))

    # check any value
    for v in item.values():
        if starts(v):
            return True
        # also look one level into simple containers
        if isinstance(v, (list, tuple)) and any(starts(x) for x in v):
            return True
        if isinstance(v, dict) and any(starts(x) for x in v.values()):
            return True
    return False

def _matching_dicts(items):
    """Return the dict elements of `items` that satisfy obj_matches."""
    return [it for it in items if isinstance(it, dict) and obj_matches(it)]


def _filter_json_lines(body: bytes):
    """Parse `body` as JSON Lines and return the matching dicts; bad lines are skipped."""
    records = []
    for line in BytesIO(body):
        line = line.strip()
        if not line:
            continue
        try:
            item = orjson.loads(line)
        except orjson.JSONDecodeError:
            continue
        if isinstance(item, dict) and obj_matches(item):
            records.append(item)
    return records


def process_key(key: str):
    """Download one S3 object and return the list of matching dicts it contains.

    Three layouts are handled, tried in this order:
      1. a JSON array of objects, filtered element by element with ijson;
      2. a single JSON document (one object, or an array that ijson could not read);
      3. JSON Lines, one object per line.

    Any failure is printed and an empty list is returned, so one bad object does not
    stop the parallel run.
    """
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        body = obj['Body'].read()  # read once; every parsing attempt works from memory

        # 1) Array: only attempted when the payload actually starts with '['. A top-level
        #    object would pass through ijson without error but yield no 'item' elements,
        #    which is indistinguishable from "no matches".
        if body[:1024].lstrip()[:1] == b'[':
            try:
                return _matching_dicts(ijson.items(BytesIO(body), 'item'))
            except (ijson.common.JSONError, ValueError):
                # Truncated/invalid array or extra data after it. Partial results are
                # discarded so the fallbacks below cannot duplicate them.
                pass

        # 2) Whole document in one go
        try:
            whole = orjson.loads(body)
        except orjson.JSONDecodeError:
            # 3) Not a single JSON document: treat as JSON Lines
            return _filter_json_lines(body)

        if isinstance(whole, dict):
            whole = [whole]
        return _matching_dicts(whole) if isinstance(whole, list) else []
    except Exception as e:
        print(f"Error processing {key}: {e}")
        return []

# Run in parallel. Executor.map yields results in the order of json_keys, so the row
# order of the final frame is the same on every run (as_completed yields in finish order).
filtered_chunks = []
with ThreadPoolExecutor(max_workers=max_workers) as ex:
    for chunk in ex.map(process_key, json_keys):
        if chunk:
            filtered_chunks.append(pd.DataFrame.from_records(chunk, index=None))

# Concatenate all per-file frames in a single call
if filtered_chunks:
    df = pd.concat(filtered_chunks, ignore_index=True)
    print("Filtered DataFrame created:", df.shape)
    print(df.head())
else:
    df = pd.DataFrame()
    print("No matching records found.")
