In [None]:
%pip install augini

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import json
import os
import sys
sys.path.append('..')
from augini import Augini

# Input your api key
def get_api_key():
    """Return the API key used to configure Augini.

    The ``OPENROUTER_TOKEN`` environment variable is used when it is set to a
    non-empty value. Otherwise the user is prompted for the key. The prompt
    goes through ``getpass`` rather than ``input`` so the typed secret is not
    echoed into the notebook's stored cell output.

    Returns:
        str: The API key. A manually entered key has surrounding whitespace
        removed, since pasted keys often carry a trailing newline or space.
    """
    import getpass  # local import: only needed by the interactive fallback

    api_key = os.environ.get('OPENROUTER_TOKEN')
    if api_key:
        print("Using API key from environment variable.")
        return api_key
    return getpass.getpass("Enter your API key manually: ").strip()

# Build the Augini client: resolve the key first, then route requests through
# OpenRouter using the pinned gpt-4o-mini model identifier.
api_key = get_api_key()
augini = Augini(
    model='openai/gpt-4o-mini-2024-07-18',
    use_openrouter=True,
    api_key=api_key,
)

Using API key from environment variable.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create a sample customer dataset
# Seed NumPy's global RNG so re-running this cell reproduces the same table.
np.random.seed(42)
n_customers = 100

# NOTE: every column below is sampled independently from the shared global RNG,
# in the order written. Reordering, adding, or removing a draw changes all
# subsequent values. Because the draws are independent, TotalCharges is not
# derived from Tenure/MonthlyCharges and Churn is unrelated to the features.
data = {
    # Zero-padded sequential IDs: C0001 .. C0100 for n_customers = 100
    'CustomerID': [f'C{i:04d}' for i in range(1, n_customers + 1)],
    # randint's upper bound is exclusive: ages 18-79, tenure 0-9
    'Age': np.random.randint(18, 80, n_customers),
    'Tenure': np.random.randint(0, 10, n_customers),
    # Uniform draws rounded to two decimals
    'MonthlyCharges': np.random.uniform(20, 200, n_customers).round(2),
    'TotalCharges': np.random.uniform(100, 5000, n_customers).round(2),
    # Categorical columns: each option is equally likely (no `p` given)
    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers),
    'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_customers),
    'Churn': np.random.choice([0, 1], n_customers, p=[0.7, 0.3])  # each customer churns with probability 0.3 (expected rate, not exact)
}

df = pd.DataFrame(data)

# Show the first rows as a rich table
print("Original Dataset:")
display(df.head())

Original Dataset:


Unnamed: 0,CustomerID,Age,Tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,Churn
0,C0001,56,9,31.68,313.66,One year,Electronic check,0
1,C0002,69,3,65.7,4973.3,One year,Bank transfer,0
2,C0003,46,7,64.44,2402.73,Month-to-month,Credit card,0
3,C0004,32,6,145.33,1469.85,Two year,Electronic check,0
4,C0005,60,8,148.21,4429.12,Month-to-month,Bank transfer,1


# 1. Data Augmentation: Add synthetic features using Augini

In [3]:
# Names of the three columns requested from the model; they match the JSON keys
# named in the prompt below.
generated_columns = ['ChurnReason', 'RetentionOffer', 'EstimatedLTV']

augment_prompt = """
Based on the customer's age, tenure, monthly charges, total charges, contract type, and payment method, suggest:
1. A likely reason for churn (if applicable)
2. A personalized retention offer
3. The customer's estimated lifetime value (in dollars)

Respond with a JSON object with keys 'ChurnReason', 'RetentionOffer', and 'EstimatedLTV'.
"""

# Ask Augini to add the generated columns to the customer table.
augmented_df = augini.augment_columns(
    df,
    generated_columns,
    custom_prompt=augment_prompt,
)

print("\nAugmented Dataset:")
display(augmented_df.head())


Augmented Dataset:


Unnamed: 0,CustomerID,Age,Tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,Churn,ChurnReason,RetentionOffer,EstimatedLTV
0,C0001,56,9,31.68,313.66,One year,Electronic check,0,N/A - Customer is currently not at risk of chu...,Offer a loyalty discount of 10% on monthly cha...,380.0
1,C0002,69,3,65.7,4973.3,One year,Bank transfer,0,N/A - The customer has not churned.,Provide a loyalty discount of 15% on monthly c...,1979.2
2,C0003,46,7,64.44,2402.73,Month-to-month,Credit card,0,Not applicable since the customer is currently...,Offer a loyalty discount of 15% for the next 6...,3802.73
3,C0004,32,6,145.33,1469.85,Two year,Electronic check,0,Low satisfaction with service quality due to h...,Offer a promotional rate of $120/month for the...,1200.0
4,C0005,60,8,148.21,4429.12,Month-to-month,Bank transfer,1,High monthly charges and a month-to-month cont...,Offer a discounted rate of $120 per month for ...,6205.44


In [4]:
# Print a readable card for each of the first three augmented customers.
print("\nExample Augmented Data:")
for position in (0, 1, 2):
    row = augmented_df.iloc[position]
    print("\nCustomer ID: {}".format(row['CustomerID']))
    print(
        f"Age: {row['Age']}, Tenure: {row['Tenure']}, "
        f"Monthly Charges: ${row['MonthlyCharges']:.2f}"
    )
    # Churn is stored as 0/1; show it as No/Yes
    churn_label = 'Yes' if row['Churn'] else 'No'
    print(f"Contract: {row['Contract']}, Churn: {churn_label}")
    print("Churn Reason: {}".format(row['ChurnReason']))
    print("Retention Offer: {}".format(row['RetentionOffer']))
    print("Estimated LTV: ${:.2f}".format(row['EstimatedLTV']))


Example Augmented Data:

Customer ID: C0001
Age: 56, Tenure: 9, Monthly Charges: $31.68
Contract: One year, Churn: No
Churn Reason: N/A - Customer is currently not at risk of churning.
Retention Offer: Offer a loyalty discount of 10% on monthly charges for the next 6 months.
Estimated LTV: $380.00

Customer ID: C0002
Age: 69, Tenure: 3, Monthly Charges: $65.70
Contract: One year, Churn: No
Churn Reason: N/A - The customer has not churned.
Retention Offer: Provide a loyalty discount of 15% on monthly charges for the next 6 months to encourage continued engagement.
Estimated LTV: $1979.20

Customer ID: C0003
Age: 46, Tenure: 7, Monthly Charges: $64.44
Contract: Month-to-month, Churn: No
Churn Reason: Not applicable since the customer is currently retained and has a moderate tenure.
Retention Offer: Offer a loyalty discount of 15% for the next 6 months for staying on a month-to-month contract.
Estimated LTV: $3802.73


Key Observations:

- **Augini** generates personalized retention offers based on each customer's specific situation, considering factors like age, contract type, and perceived churn risk.
- Estimated LTV varies significantly, highlighting the importance of tailored retention strategies for high-value customers.
- **Augini** identifies potential churn reasons even for non-churned customers, allowing for proactive retention efforts.
- The retention offers are designed to address the specific concerns or risk factors identified for each customer.

# 2. Data Analysis and Visualization

In [None]:
# One 2x2 figure; each panel is drawn on its own explicitly named axes.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_contract, ax_charges), (ax_tenure, ax_corr) = axes

# Top-left: customer counts per contract type, split by churn flag
sns.countplot(data=df, x='Contract', hue='Churn', ax=ax_contract)
ax_contract.set_title('Churn Rate by Contract Type')

# Top-right: monthly charges for churned vs non-churned customers, with KDE overlay
sns.histplot(data=df, x='MonthlyCharges', hue='Churn', kde=True, ax=ax_charges)
ax_charges.set_title('Monthly Charges Distribution')

# Bottom-left: tenure spread for each churn value
sns.boxplot(data=df, x='Churn', y='Tenure', ax=ax_tenure)
ax_tenure.set_title('Tenure vs Churn')

# Bottom-right: pairwise correlations of the numeric columns and the churn flag
numeric_columns = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
corr_matrix = df[numeric_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')

fig.tight_layout()
plt.show()

# 3. Churn Prediction Model

In [None]:
# Train on the numeric columns only; Contract and PaymentMethod are not encoded here.
features = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']
X = df[features]
y = df['Churn']

# stratify=y keeps the churn / non-churn proportion the same in the train and
# test splits. Without it, a small test split of an imbalanced target can end
# up with very few (or no) churners, which makes the report below meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\nChurn Prediction Model Performance:")
# zero_division=0 reports 0.0 instead of raising a warning when the model never
# predicts one of the classes (precision would otherwise be 0/0).
print(classification_report(y_test, y_pred, zero_division=0))

# 4. Feature Importance
# Rank the predictors by the forest's impurity-based importances, largest first.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance for Churn Prediction')
plt.show()



# 5. Insights Generation using Augini

In [8]:
# Prompt asking the model for three free-text insights, returned as JSON.
insights_prompt = """
Based on the customer churn analysis, provide three key insights about customer behavior, churn patterns, and potential retention strategies.
Consider factors such as contract types, payment methods, tenure, and charges.

Respond with a JSON object with keys 'Insight1', 'Insight2', and 'Insight3'.
"""

# Only the first four augmented rows are passed to augment_single (not the
# whole DataFrame); the generated values are requested under 'Insights'.
insights = augini.augment_single(
    augmented_df.head(4),
    'Insights',
    custom_prompt=insights_prompt,
)

In [None]:
# Preview the first rows of the result, which carries the generated 'Insights' column
insights.head()

In [None]:
# Walk the first three rows and print each generated insight. A dict payload is
# printed key by key; anything else is shown raw so it can be inspected.
print("Insights from the first 3 rows:")
for position in (0, 1, 2):
    insight = insights['Insights'].iloc[position]
    print(f"\nRow {position + 1}:")
    if not isinstance(insight, dict):
        print("Unexpected data type. Raw insight:")
        print(insight)
    else:
        for key, value in insight.items():
            print("{}: {}".format(key, value))
    print(50 * "-")  # visual divider between rows

# 6. Retention Strategy Recommendations

In [11]:
# Prompt asking the model for three retention recommendations, returned as JSON.
strategy_prompt = """
Based on the churn analysis and generated insights, provide three strategic recommendations to reduce customer churn and improve retention.
Consider personalized offers, pricing strategies, contract adjustments, and customer engagement initiatives.

Respond with a JSON object with keys 'Strategy1', 'Strategy2', 'Strategy3'.
"""

# The first four augmented rows are passed to augment_single; the generated
# values are requested under 'Strategies'.
strategies = augini.augment_single(
    augmented_df.head(4),
    'Strategies',
    custom_prompt=strategy_prompt,
)

In [None]:
# Print the generated strategies for the first three rows of `strategies`.
# A dict payload is printed key by key; any other type is shown raw so the
# unexpected value can be inspected.
print("Strategies from the first 3 rows:")
for i in range(3):
    strategy = strategies['Strategies'].iloc[i]
    print(f"\nRow {i + 1}:")
    if isinstance(strategy, dict):
        for key, value in strategy.items():
            print(f"{key}: {value}")
    else:
        print("Unexpected data type. Raw strategy:")
        print(strategy)
    print("-" * 50)  # Separator between rows

# Summary of Retention Strategies

## Strategy 1: Personalized Offers and Loyalty Programs

- **Long-term Customers**: Implement a personalized loyalty discount program, offering up to 15% off monthly charges for customers with 9+ years tenure.
- **Age-based Targeting**: Tailor offers based on age demographics, particularly for senior customers.
- **Contract Commitment**: Encourage month-to-month customers to switch to longer-term plans with incentives like discounted rates (e.g., $54.44/month for 6 months on a 12-month contract).

## Strategy 2: Flexible and Tiered Pricing

- **Usage-based Plans**: Introduce flexible pricing options that align with different usage patterns and customer needs.
- **Senior-friendly Options**: Develop tiered pricing that caters specifically to senior citizens' budgets and service requirements.
- **Competitive Alignment**: Analyze the market to ensure prices reflect perceived value and introduce loyalty rewards for long-term customers.

## Strategy 3: Enhanced Customer Engagement

- **Proactive Communication**: Establish regular check-ins with customers to address concerns and offer personalized solutions.
- **Feedback Mechanism**: Implement quarterly customer satisfaction surveys to gather insights and preemptively address issues.
- **Relationship Building**: Foster stronger connections through ongoing engagement initiatives and responsive customer service.

These strategies focus on personalizing offers, creating flexible pricing structures, and improving customer engagement to reduce churn and enhance overall customer satisfaction.

## Chat Examples

In [5]:
# Ask a free-form question about the augmented table; the reply is displayed as a plain string
augini.chat("What are the summary statistics?", augmented_df)

'The summary statistics of the DataFrame are as follows: Age: count = 100.0, mean = 50.27, std = 19.18, min = 19.0, 25% = 34.75, 50% = 51.5, 75% = 68.0, max = 79.0; Tenure: count = 100.0, mean = 4.98, std = 2.92, min = 0.0, 25% = 2.0, 50% = 6.0, 75% = 7.25, max = 9.0; MonthlyCharges: count = 100.0, mean = 119.22, std = 49.41, min = 23.25, 25% = 77.87, 50% = 117.56, 75% = 161.95, max = 199.59; TotalCharges: count = 100.0, mean = 2253.31, std = 1454.11, min = 125.41, 25% = 1027.41, 50% = 2039.12, 75% = 3413.11, max = 4973.3; Churn: count = 100.0, mean = 0.36, std = 0.48, min = 0.0, 25% = 0.0, 50% = 0.0, 75% = 1.0, max = 1.0; EstimatedLTV: count = 100.0, mean = 4392.04, std = 4264.07, min = 207.3, 25% = 1230.42, 50% = 2512.58, 75% = 6269.04, max = 19644.0.'

In [6]:
# Structural question: the reply lists the original columns plus the three generated ones
augini.chat("What are the column names?", augmented_df)

"The column names in the DataFrame are: ['CustomerID', 'Age', 'Tenure', 'MonthlyCharges', 'TotalCharges', 'Contract', 'PaymentMethod', 'Churn', 'ChurnReason', 'RetentionOffer', 'EstimatedLTV']."

In [7]:
# Same request as the first chat example, reworded; the reply reports the same statistics
augini.chat("Give me the summary statistics.", augmented_df)

'The summary statistics for the DataFrame are as follows: Age: count = 100.0, mean = 50.27, std = 19.18, min = 19.0, 25% = 34.75, 50% = 51.5, 75% = 68.0, max = 79.0; Tenure: count = 100.0, mean = 4.98, std = 2.92, min = 0.0, 25% = 2.0, 50% = 6.0, 75% = 7.25, max = 9.0; MonthlyCharges: count = 100.0, mean = 119.22, std = 49.41, min = 23.25, 25% = 77.87, 50% = 117.56, 75% = 161.95, max = 199.59; TotalCharges: count = 100.0, mean = 2253.31, std = 1454.11, min = 125.41, 25% = 1027.41, 50% = 2039.12, 75% = 3413.11, max = 4973.3; Churn: count = 100.0, mean = 0.36, std = 0.48, min = 0.0, 25% = 0.0, 50% = 0.0, 75% = 1.0, max = 1.0; EstimatedLTV: count = 100.0, mean = 4392.04, std = 4264.07, min = 207.3, 25% = 1230.42, 50% = 2512.58, 75% = 6269.04, max = 19644.0.'

In [8]:
# Inspect the recorded chat history: a list of entries holding the query, the response and a df_context
augini.conversation_history

[{'query': 'What are the summary statistics?',
  'response': 'The summary statistics of the DataFrame are as follows: Age: count = 100.0, mean = 50.27, std = 19.18, min = 19.0, 25% = 34.75, 50% = 51.5, 75% = 68.0, max = 79.0; Tenure: count = 100.0, mean = 4.98, std = 2.92, min = 0.0, 25% = 2.0, 50% = 6.0, 75% = 7.25, max = 9.0; MonthlyCharges: count = 100.0, mean = 119.22, std = 49.41, min = 23.25, 25% = 77.87, 50% = 117.56, 75% = 161.95, max = 199.59; TotalCharges: count = 100.0, mean = 2253.31, std = 1454.11, min = 125.41, 25% = 1027.41, 50% = 2039.12, 75% = 3413.11, max = 4973.3; Churn: count = 100.0, mean = 0.36, std = 0.48, min = 0.0, 25% = 0.0, 50% = 0.0, 75% = 1.0, max = 1.0; EstimatedLTV: count = 100.0, mean = 4392.04, std = 4264.07, min = 207.3, 25% = 1230.42, 50% = 2512.58, 75% = 6269.04, max = 19644.0.',
  'df_context': {'columns': ['CustomerID',
    'Age',
    'Tenure',
    'MonthlyCharges',
    'TotalCharges',
    'Contract',
    'PaymentMethod',
    'Churn',
    'ChurnRea