In [9]:
from dotenv import load_dotenv
load_dotenv()

from huggingface_hub import InferenceClient
import os
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, types, text, Engine
import time

from src import database_methods as dbm
from src import ai_agent_methods as aiam


In [10]:
# Hugging Face inference client shared by the rest of the notebook.
# The token is read from the HF_TOKEN environment variable (load_dotenv() is
# called in the import cell); a missing variable raises KeyError right here.
client: InferenceClient = InferenceClient(
    api_key=os.environ["HF_TOKEN"],
)

# Smoke test: send one trivial question to confirm that the token, the model id
# and the provider route all work before any batch generation is attempted.
try:
    completion = client.chat.completions.create(
        model="openai/gpt-oss-120b:groq",
        messages=[
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ],
    )

    print(f"Model answer (What is the capital of France?):\n{completion.choices[0].message.content}")
except Exception as e:
    # Report the cause as well: without it an auth failure, a network error and
    # an exhausted quota all look identical.
    print(f"Model unsuccessfully loaded. Error: {e}")

Model answer (What is the capital of France?):
The capital of France is **Paris**.


## Because the source data contains no customer e-mail addresses, addresses are generated specifically for this project by concatenating the prefix "customer_", the customer_id, and the suffix "@mail.com".

## These addresses are generated in the SQL query together with the other parameters the AI Agent needs to write e-mails both to customers who have been inactive for some time and to active customers; each e-mail offers a store discount that encourages further purchases.

In [11]:
# Open the database connection. A failure is logged and then re-raised:
# continuing without an engine would only fail later in pd.read_sql with an
# unrelated NameError (or silently reuse a stale engine from an earlier run).
try:
    engine: Engine = dbm.get_db_engine()
    print("DB Engine successfuly created")
except Exception as e:
    print(f"DB Engine creation error: {e}")
    raise

# One row per customer, aggregated from e_commerce_order_details:
#   monetary            - SUM(quantity * price)
#   frequency           - number of distinct orders
#   recency             - latest date in the whole table minus the customer's
#                         latest order date (presumably in days; depends on the
#                         type of the `date` column - TODO confirm)
#   average_order_value - monetary / frequency, rounded to 2 decimals
#   churn               - 1 when recency >= 90, otherwise 0
#   email               - synthetic address 'customer_<customer_id>@mail.com'
query: str = """
SELECT 
    customer_id, 
    SUM(quantity * price) AS monetary, 
    COUNT (DISTINCT order_id) AS frequency,
    ((SELECT MAX(date) FROM e_commerce_order_details) - MAX(date)) AS recency, 
    ROUND((SUM(quantity * price) / (COUNT (DISTINCT order_id))), 2) AS average_order_value, 
	CASE
		WHEN 
			((SELECT MAX(date) FROM e_commerce_order_details) - MAX(date)) >= 90
		THEN
			1
		ELSE
			0
	END AS churn, 
	CONCAT('customer_', customer_id,'@mail.com') AS email
FROM 
	e_commerce_order_details
GROUP BY 
    customer_id;
"""
df_rfm: pd.DataFrame = pd.read_sql(
    query, 
    engine
)

# Show a few random rows as a sanity check of the query result.
print(df_rfm.sample(5))
del query


DB Engine successfuly created
     customer_id  monetary  frequency  recency  average_order_value  churn  \
1114       13658  10373.48          5        9              2074.70      0   
2462       15382  40748.55          6       14              6791.43      0   
4484       17965  18415.04         14       37              1315.36      0   
1208       13775     11.53          1      142                11.53      1   
1975       14764   2232.72          1       46              2232.72      0   

                        email  
1114  customer_13658@mail.com  
2462  customer_15382@mail.com  
4484  customer_17965@mail.com  
1208  customer_13775@mail.com  
1975  customer_14764@mail.com  


In [12]:
# 80th-percentile cut-offs (rounded to 2 decimals) for the three value metrics.
# They are used further down to decide which active customers count as VIPs.
threshold_monetary: np.float64 = np.round(
    df_rfm["monetary"].quantile(q=0.80), decimals=2
)
threshold_frequency: np.float64 = np.round(
    df_rfm["frequency"].quantile(q=0.80), decimals=2
)
threshold_aov: np.float64 = np.round(
    df_rfm["average_order_value"].quantile(q=0.80), decimals=2
)

# Report the cut-offs in the same order they were computed.
print(f"Monetary threshold:\n{threshold_monetary}", "\n")
print(f"Frequency threshold:\n{threshold_frequency}", "\n")
print(f"Average order value threshold:\n{threshold_aov}", "\n")


Monetary threshold:
14660.86 

Frequency threshold:
5.0 

Average order value threshold:
3616.08 



In [13]:
# Segment labels, listed in the same order as the masks in `conditions` below.
choice: list[str] = ["churn_recovery", "vip_loyalty"]

# Boolean masks evaluated in priority order by np.select:
#   1. churned customers;
#   2. active customers that reach at least one of the three thresholds.
conditions: list[pd.Series] = [
    df_rfm["churn"].eq(1),
    df_rfm["churn"].eq(0)
    & (
        df_rfm["monetary"].ge(threshold_monetary)
        | df_rfm["frequency"].ge(threshold_frequency)
        | df_rfm["average_order_value"].ge(threshold_aov)
    ),
]

# Customers matching neither mask receive the default label.
df_rfm["segmentation"] = np.select(
    condlist=conditions,
    choicelist=choice,
    default="standard_promo",
)
print(df_rfm)

# Drop the helpers that are not needed after segmentation.
del threshold_monetary, threshold_frequency, threshold_aov, choice


     customer_id  monetary  frequency  recency  average_order_value  churn  \
0          12004   1509.60          1      227              1509.60      1   
1          12006     24.76          1      218                24.76      1   
2          12008   5689.57          1      276              5689.57      1   
3          12013     69.96          1      359                69.96      1   
4          12024    149.52          1      176               149.52      1   
...          ...       ...        ...      ...                  ...    ...   
4713       18280    623.26          1      277               623.26      1   
4714       18281    576.58          1      180               576.58      1   
4715       18282   1044.86          2        7               522.43      0   
4716       18283  12114.61         16        3               757.16      0   
4717       18287  18139.56          3       42              6046.52      0   

                        email    segmentation  
0     customer_

In [14]:
# Instruction prompts, one per customer segment. Each fixes the persona, the
# goal of the e-mail, the discount code, the tone and a length limit.

# Active customers that reached at least one high-value threshold.
vip_loyalty_prompt: str = r"""
    You are a Customer Success Manager for a premium brand. 
    Write a short, exclusive appreciation email to our VIP customer. 
    Thank them for their loyalty. 
    Offer a special 20% discount on their next purchase with code: VIP20. 
    Tone: Professional, grateful, and exclusive. 
    Keep it under 100 words. Do not use hashtags.
"""

# Customers flagged as churned.
# Fix: the last line used to end with a stray double quote (`words."`) that was
# sent to the model as part of the prompt.
churn_recovery_prompt: str = r"""
    You are a warm and friendly Customer Support Specialist. 
    Write a 'We Miss You' email to a customer who hasn't purchased in a while. 
    Convince them to come back. 
    Offer a 15% welcome back discount with code: MISSYOU15. 
    Tone: Empathetic, warm, casual. 
    Keep it under 80 words.
"""

# Every remaining (active, non-VIP) customer.
standard_promo_prompt: str = r"""
    You are an energetic Marketing Copywriter. 
    Write a catchy promotional email to an active customer. 
    Encourage them to check out our new arrivals. 
    Offer a 10% discount on the next order with code: HELLO10. 
    Tone: Exciting, direct, sales-oriented. 
    Keep it under 80 words.
"""

# Lookup table: the keys are exactly the labels stored in df_rfm["segmentation"].
prompt: dict[str,str] = {
    "vip_loyalty": vip_loyalty_prompt, 
    "churn_recovery": churn_recovery_prompt, 
    "standard_promo": standard_promo_prompt
}



## Methodology: AI Content Generation & API Integration
This module implements an LLM-based (Large Language Model) agent to automate personalized marketing communication. Below are the key technical decisions and the rationale behind the chosen data flow architecture.

1. Simulation Mode ("Dry Run") & Data Safety
For safety reasons and development purposes, no actual emails are transmitted to customer addresses.

Process: Generated content is stored directly within the pandas.DataFrame as a new feature (email_draft).

Persistence: Upon completion, the enriched dataset is exported to a .csv file. This allows for a Human-in-the-Loop (HITL) approach, where drafts undergo Quality Assurance (QA) before any potential production deployment.

2. Sequential Execution Strategy (Single-Threaded)
The system uses the pandas DataFrame.apply() method for row-by-row processing. While vectorization is typically preferred in Data Science for performance, sequential processing is the optimal architectural choice in this specific context.

I deliberately avoided Parallel Processing (e.g., joblib, multiprocessing) due to the constraints of the external API:

Traffic Control: Using a sequential loop within .apply() allows for precise pacing via time.sleep(). This ensures the script respects the API's concurrency limits and maintains stability throughout the ETL process.

I/O Bound Process: The bottleneck is not local CPU computation power, but Network Latency (waiting for the server to generate text).

3. Operational Scope & Sampling Strategy
To ensure high availability and model performance, the system utilizes the Groq AI inference provider (via Hugging Face integration).

Due to the strict usage quotas and rate limits associated with the free tier of this provider, the generation process is strictly limited to a representative sample of the first 50 customers (df_rfm.head(50)).

Rationale: This sample size is sufficient to validate the Proof of Concept (PoC) and demonstrate the efficacy of the personalization prompts without incurring overage costs or triggering API blocks.

Scalability: The architecture is designed to scale to the full dataset (N=4000+) instantly upon upgrading to a paid enterprise plan.

In [15]:
# Dry run on three random customers before the larger batch. No random_state is
# passed to sample(), so different customers are picked on every execution.
df_test: pd.DataFrame = df_rfm.sample(3)
try:
    # Each row is handed to aiam.generate_email together with the shared client
    # and the segment -> prompt mapping.
    df_test["email_draft"] = df_test.apply(aiam.generate_email, axis=1, args=(client, prompt))
except Exception as e:
    print(f"Email drafts unsuccessfully generated. Error: {e}")
else:
    print("Email drafts successfully generated")
    # Print only on success: after a failure the "email_draft" column does not
    # exist and reading it would raise a KeyError that hides the real error.
    # Iterating over the column also removes the duplicated sample size.
    for draft in df_test["email_draft"]:
        print(draft)
        print(50 * "-")
del df_test

Email drafts successfully generated
Dear Customer 16971,

We’ve just stocked fresh arrivals you’ll love—stylish, unique pieces ready to revamp your wardrobe.

Explore now and enjoy an exclusive 10% off your next order with code HELLO10. Hurry, the new collection won’t wait!

Tap the link below and treat yourself today.

Best regards,
The Wojciech Kiełbowicz & Co Team
--------------------------------------------------
Dear Customer 14650,

We’ve noticed it’s been a while since your last visit and we truly miss having you with us. Your satisfaction means a lot, and we’d love to welcome you back.

Enjoy a 15% welcome‑back discount on your next order with code MISSYOU15. Hope to see you soon!

Best regards,
The Wojciech Kiełbowicz & Co Team
--------------------------------------------------
Dear Customer 17912,

We’ve missed you at Wojciech Kiełbowicz & Co! It’s been a while since your last visit, and we hope everything’s been great for you.

To show our appreciation, here’s a 15% welcome‑

In [None]:
# Number of leading rows of df_rfm that get a generated draft; kept in one place
# so generation and export always cover the same rows.
draft_limit: int = 50

# Columns written to the review file.
cols: list[str] = [
    "customer_id", 
    "email_draft"
]

try:
    print("Email drafts are being generated...")
    # Only the first `draft_limit` rows are processed; the assignment aligns on
    # the index, so every other row of df_rfm gets NaN in "email_draft".
    df_rfm["email_draft"] = df_rfm.head(draft_limit).apply(
        aiam.generate_email, axis=1, args=(client, prompt)
    )
except Exception as e:
    print(f"Email drafts unsuccessfully generated. Error: {e}")
else:
    print("Email drafts successfully generated")
    # Export only after a successful run: on failure the "email_draft" column
    # may not exist and the export would raise a KeyError.
    df_rfm.head(draft_limit)[cols].to_csv("../data/processed/marketing_campaign_drafts.csv", index=False)

del cols, draft_limit

Email drafts are being generated...
