# Step 1: Installing and importing the libraries

In [None]:
!pip install google-cloud-storage>=2.2.1
!pip install transformers
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install accelerate
!pip install datasets
!pip install mlflow==2.15.0

In [None]:
import torch
import transformers
# from trl import SFTTrainer
from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training)
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline)
import pandas as pd
from datasets import load_dataset, Dataset


# Step 2: Preparing the data


In [None]:
# List what is available under /kaggle/input/ so the dataset path used below can be checked.
!ls /kaggle/input/

In [None]:
# Load the financial dataset
file_path = '/kaggle/input/ValorizationData.xlsx'  # Update with the correct path if different
df = pd.read_excel(file_path)

# Preprocessing: replace missing values with empty strings so the text built
# from these columns later never contains NaN.
df.fillna('', inplace=True)

# Translate the English column names to French. Names that are not present in
# the sheet are silently ignored by rename(), so this is safe to re-run.
df.rename(columns={
    'Saphyr Reference': 'Référence Saphyr',
    'Year': 'Année',
    'Month': 'Mois',
    'Forecast_M-1 (Cost)': 'Prévision_M-1 (Coût)',
    'Valorization (Cost)': 'Valorisation (Coût)',
    'Forecast_M-1 (Days)': 'Prévision_M-1 (Jours)',
    'Valorization (Days)': 'Valorisation (Jours)',
    'Forecast_M-1 (Margin%)': 'Prévision_M-1 (Marge%)',
    'Valorization (Margin%)': 'Valorisation (Marge%)',
    'Forecast_M-1 (Turnover)': 'Prévision_M-1 (Chiffre d’affaires)',
    'Valorization (Turnover)': 'Valorisation (Chiffre d’affaires)',
    'ValorizationComment': 'Commentaire de Valorisation'
}, inplace=True)

# Keep only the rows whose comment is at least 30 characters long.
# .copy() turns the filtered selection into an independent frame, so columns can
# be added to it afterwards without pandas emitting SettingWithCopyWarning.
df = df[df['Commentaire de Valorisation'].str.len() >= 30].copy()
df.head(5)


In [None]:
def _describe_project(row):
    """Build the French sentence describing one row's forecast and valorization figures.

    The line breaks and the four-space indentation inside the template are part
    of the produced text.
    """
    return f"""Projet {row['Référence Saphyr']} en {row['Année']}-{row['Mois']}
    avec des coûts prévisionnels de {row['Prévision_M-1 (Coût)']}, des coûts de valorisation de {row['Valorisation (Coût)']},
    des jours prévisionnels de {row['Prévision_M-1 (Jours)']}, des jours de valorisation de {row['Valorisation (Jours)']},
    une marge prévisionnelle de {row['Prévision_M-1 (Marge%)']}, une marge de valorisation de {row['Valorisation (Marge%)']},
    un chiffre d'affaires prévisionnel de {row['Prévision_M-1 (Chiffre d’affaires)']},
    un chiffre d'affaires de valorisation de {row['Valorisation (Chiffre d’affaires)']}."""


# Create the text field in French, one description per row
df['text'] = df.apply(_describe_project, axis=1)

# Lift the column-width limit so the full text is printed, then preview the result
pd.set_option('display.max_colwidth', None)
text_preview = df[['Référence Saphyr', 'text']].head()
print(text_preview)

In [None]:
# Inputs are the generated descriptions; targets are the human-written comments.
inputs, targets = (
    df[column].tolist() for column in ('text', 'Commentaire de Valorisation')
)

# Column layout of the Hugging Face dataset
data = dict(text=inputs, target_text=targets)

dataset = Dataset.from_pandas(pd.DataFrame.from_dict(data))

# No train/validation split is made: the whole dataset is kept in `dataset`.

In [None]:
# Display the Dataset object as this cell's output.
dataset

In [None]:
print(dataset[0])  # Inspect the first sample (a dict with 'text' and 'target_text' keys)


In [None]:
# Marker placed between the project description and the expected analysis
response_template = "### Analyse financière du projet:"


def create_text_field(sample):
    """Merge a sample's description and its target comment into one training string.

    Args:
        sample: mapping with a 'text' entry (description) and a 'target_text'
            entry (expected analysis).

    Returns:
        A dict with a single 'text' key: description, newline, response
        template, a space, then the target.
    """
    merged = f"{sample['text']}\n{response_template} {sample['target_text']}"
    return {"text": merged}

# Rewrite every sample through create_text_field, one sample at a time, and
# drop all the columns that existed before the mapping.
dataset = dataset.map(
    create_text_field,
    batched=False,
    remove_columns=dataset.features,
)

# Quick check: first merged sample and number of samples
print(dataset[0])
print(len(dataset))

In [None]:
# Show the first sample as the cell output.
dataset[0]

# Step 3: Login to Hugging Face

In [None]:
# Log in to the Hugging Face Hub with a token read from Kaggle Secrets,
# so the token never appears in the notebook itself.
from huggingface_hub import login
# from google.colab import userdata
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")  # secret value: do not print or log it

login(
  token=secret_value_0, # HF_TOKEN retrieved from Kaggle Secrets above
  # add_to_git_credential=True
)

# Step 4: Loading the model

In [None]:
# 4-bit NF4 quantization with double quantization; computation runs in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Checkpoint to fine-tune (alternative left commented out: aboonaji/llama2finetune-v2)
pretrained_model_name = "instruction-pretrain/finance-Llama3-8B"

llama_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name,
    quantization_config=bnb_config,
)

# Turn off the generation cache on the model config
llama_model.config.use_cache = False
# llama_model.config.pretraining_tp = 1

# Step 5: Loading the tokenizer

In [None]:

# Load the fast tokenizer that belongs to the chosen checkpoint
llama_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name,
    use_fast=True,
    trust_remote_code=True,
)

# Fill in special tokens that the checkpoint does not define:
# padding falls back to the end-of-sequence token ...
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token

# ... while BOS and SEP fall back to the CLS token.
for special_name in ("bos_token", "sep_token"):
    if getattr(llama_tokenizer, special_name) is None:
        setattr(llama_tokenizer, special_name, llama_tokenizer.cls_token)

# Pad on the right-hand side of each sequence
llama_tokenizer.padding_side = "right"



In [None]:
llama_tokenizer.padding_side = "right"

# Verify the configuration: print each special token in turn
for label, attribute in (
    ("PAD", "pad_token"),
    ("EOS", "eos_token"),
    ("BOS", "bos_token"),
    ("SEP", "sep_token"),
):
    print(f"{label} token: {getattr(llama_tokenizer, attribute)}")

# Test tokenization: encode a sample sentence, then decode the ids back to text
test_text = "Voici un texte à tester."
tokens = llama_tokenizer(test_text)
print(tokens)
decoded_text = llama_tokenizer.decode(tokens['input_ids'])
print(decoded_text)

# Step 6: Setting the training arguments

In [None]:

import mlflow
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datetime import datetime
# LoRA config based on QLoRA paper & Sebastian Raschka experiment

peft_config = LoraConfig(
    lora_alpha=16,       # Moderate alpha for balanced scaling
    lora_dropout=0.05,   # Standard dropout to avoid overfitting
    r=64,                # Moderate rank for expressiveness
    bias="none",         # No bias adaptation
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']  # Attention projection layers that receive adapters
)

# Define training arguments for lower memory usage
training_arguments = TrainingArguments(
    # Set this to mlflow for logging your training
    report_to="mlflow",
    output_dir="./results",
    # evaluation_strategy="steps",
    # do_eval=True,
    optim="adamw_torch",  # Use a simpler optimizer
    per_device_train_batch_size=1,  # Reduce batch size to fit memory constraints
    gradient_accumulation_steps=16,  # Increase accumulation steps to compensate for small batch size
    per_device_eval_batch_size=1,  # Reduce eval batch size
    log_level="info",
    save_steps=50,  # Save less frequently if needed
    logging_steps=25,  # Log less frequently to reduce overhead
    learning_rate=1e-4,  # Use a lower learning rate
    # eval_steps=100,  # Evaluate less frequently
    num_train_epochs=1,  # Has no effect while max_steps is positive (max_steps takes precedence)
    max_steps=500,  # Limit the total number of steps for quick testing
    warmup_steps=50,  # Reduce warmup steps
    lr_scheduler_type="linear",
    fp16=True,  # Enable mixed-precision training
    # Name the MLflow run. The timestamp ends with %S (seconds, 00-59); the
    # lowercase %s is a non-portable platform extension and was not intended.
    run_name=f"{pretrained_model_name}-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
)

# Prepare the 4-bit quantized base model for training before the adapters are
# attached, as PEFT documents for k-bit (QLoRA-style) fine-tuning.
llama_model = prepare_model_for_kbit_training(llama_model)

peft_model = get_peft_model(llama_model, peft_config)
peft_model.print_trainable_parameters()



# training_arguments = TrainingArguments(output_dir = "./results", per_device_train_batch_size = 4, max_steps = 100)

# Step 7: Tokenize the dataset

In [None]:
# Tokenize the dataset
def tokenize_function(sample):
    """Tokenize the 'text' field, truncating and padding every sequence to 512 tokens."""
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",  # every sequence is padded up to max_length
        max_length=512,        # Adjust based on your model's expected input size
    )
    return llama_tokenizer(sample['text'], **tokenizer_options)

# Apply tokenization in batches over the whole dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Print the first tokenized sample to check the fields produced by the tokenizer.
print(tokenized_dataset[0])

# Step 8: Creating the Fine-Tuning trainer

In [None]:
# The plain transformers Trainer is used here (no SFTTrainer).
# Collator in causal-LM mode (mlm=False)
causal_lm_collator = transformers.DataCollatorForLanguageModeling(llama_tokenizer, mlm=False)

llama_trainer = transformers.Trainer(
    model=peft_model,
    args=training_arguments,
    train_dataset=tokenized_dataset,
    data_collator=causal_lm_collator,
)

# use_cache=True is incompatible with gradient checkpointing.
peft_model.config.use_cache = False

# Step 9: Training the model

In [None]:
# Run the fine-tuning with the trainer created in the previous step.
llama_trainer.train()

# Step 10: Save the model and the tokenizer

In [None]:
# Define the output directory
output_dir = "./saved_model"

# NOTE: local saving is currently disabled; nothing in this cell writes to output_dir.
# To enable it, uncomment the two calls below (the trainer object is `llama_trainer`).

# Save the model
# llama_trainer.save_model(output_dir)

# Save the tokenizer
# llama_tokenizer.save_pretrained(output_dir)

## MLflow: save the model

In [None]:
import mlflow

# Get the ID of the MLflow Run that was automatically created above
last_run_id = mlflow.last_active_run().info.run_id

# Save a tokenizer without padding because it is only needed for training.
# from_pretrained() expects the checkpoint name (or a path), not the loaded
# model object, so the tokenizer is reloaded from pretrained_model_name.
tokenizer_no_pad = AutoTokenizer.from_pretrained(pretrained_model_name, add_bos_token=True)

# If you interrupt the training, uncomment the following line to stop the MLflow run
# mlflow.end_run()

# Re-open the training run so the LoRA parameters and the model land in the same run
with mlflow.start_run(run_id=last_run_id):
    mlflow.log_params(peft_config.to_dict())
    mlflow.transformers.log_model(
        # Log the fine-tuned model together with the padding-free tokenizer
        transformers_model={"model": llama_trainer.model, "tokenizer": tokenizer_no_pad},
#         prompt_template=prompt_template,
#         signature=signature,
        artifact_path="model",  # This is a relative path to save model files within MLflow run
    )


## MLflow: load the model

## MLflow: compare model metrics

# Step 11: Testing the model

In [None]:
# Define the improved prompt template
def create_financial_prompt(data):
    """Build the French analysis prompt for one project.

    Args:
        data: mapping holding the project reference, year, month and the eight
            forecast/valorization figures under their French column names.

    Returns:
        A string made of a header line, one bullet per figure, a blank line and
        the fixed analysis instructions.

    Raises:
        KeyError: if one of the expected keys is missing from ``data``.
    """
    # (bullet label, key in `data`) in the order the bullets are printed
    bullet_fields = [
        ("Coûts prévisionnels", 'Prévision_M-1 (Coût)'),
        ("Coûts de valorisation", 'Valorisation (Coût)'),
        ("Jours prévisionnels", 'Prévision_M-1 (Jours)'),
        ("Jours de valorisation", 'Valorisation (Jours)'),
        ("Marge prévisionnelle", 'Prévision_M-1 (Marge%)'),
        ("Marge de valorisation", 'Valorisation (Marge%)'),
        ("Chiffre d'affaires prévisionnel", 'Prévision_M-1 (Chiffre d’affaires)'),
        ("Chiffre d'affaires de valorisation", 'Valorisation (Chiffre d’affaires)'),
    ]

    header = f"Analyse financière pour le projet {data['Référence Saphyr']} en {data['Année']}-{data['Mois']}:\n"
    bullets = "".join(f"- {label}: {data[key]}\n" for label, key in bullet_fields)
    instructions = (
        "Veuillez analyser les éléments suivants en français:\n"
        "1. Peux tu analyser la valorisation Mensuelle et me fournir des conseils\n"
        "2. Peux tu analyser la prévision mensuel et me fournir des conseils \n"
        "Merci de fournir une analyse détaillée en français."
    )
    # The extra newline leaves one blank line between the bullets and the instructions
    return header + bullets + "\n" + instructions

# Example financial data row (replace with the provided values).
# Keys are the French column names read by create_financial_prompt; amounts and
# margins are given here as already formatted strings, days as plain numbers.
example_data = {
    'Référence Saphyr': 'PB-00008',
    'Année': 2023,
    'Mois': 12,
    'Prévision_M-1 (Coût)': '-22,083.00 €',
    'Valorisation (Coût)': '-20,810.00 €',
    'Prévision_M-1 (Jours)': 86.94,
    'Valorisation (Jours)': 81.25,
    'Prévision_M-1 (Marge%)': '28.25 %',
    'Valorisation (Marge%)': '16.27 %',
    'Prévision_M-1 (Chiffre d’affaires)': '30,777.00 €',
    'Valorisation (Chiffre d’affaires)': '24,852.50 €'
}
# Create the improved prompt from the example row
financial_prompt = create_financial_prompt(example_data)


In [None]:
# Display the prompt built above to check its contents before generation.
financial_prompt

In [None]:
from transformers import pipeline

# Initialize the text generation pipeline (max_length counts prompt + generated tokens)
text_generation_pipeline = pipeline(task="text-generation", model=llama_model, tokenizer=llama_tokenizer, max_length=700)

# Build the prompt in the layout used for fine-tuning: the description followed,
# on a new line, by the response template. The former "<s>[INST] ... [/INST]"
# wrapper is the Llama-2 chat format; it was never part of the training text.
generation_prompt = f"{financial_prompt}\n{response_template}"

# Generate a response
model_answer = text_generation_pipeline(generation_prompt)

# Print the generated analysis (the pipeline returns the prompt followed by the completion)
print(model_answer[0]['generated_text'])


In [None]:
# Install Gradio
!pip install gradio

# gradio is imported for the interface defined further down; pandas reloads the project data.
import gradio as gr
import pandas as pd

In [None]:
# Load the financial dataset
file_path = '/kaggle/input/ValorizationData.xlsx'  # Update with the correct path if different

# English -> French column names (translate column names if not done already)
french_column_names = {
    'Saphyr Reference': 'Référence Saphyr',
    'Year': 'Année',
    'Month': 'Mois',
    'Forecast_M-1 (Cost)': 'Prévision_M-1 (Coût)',
    'Valorization (Cost)': 'Valorisation (Coût)',
    'Forecast_M-1 (Days)': 'Prévision_M-1 (Jours)',
    'Valorization (Days)': 'Valorisation (Jours)',
    'Forecast_M-1 (Margin%)': 'Prévision_M-1 (Marge%)',
    'Valorization (Margin%)': 'Valorisation (Marge%)',
    'Forecast_M-1 (Turnover)': 'Prévision_M-1 (Chiffre d’affaires)',
    'Valorization (Turnover)': 'Valorisation (Chiffre d’affaires)',
    'ValorizationComment': 'Commentaire de Valorisation',
}

# Read the sheet and apply the translation in one step
df = pd.read_excel(file_path).rename(columns=french_column_names)

# Initialize the text generation pipeline
text_generation_pipeline = pipeline(
    task="text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_length=500,
)

# Function to extract data based on project name
def extract_data(project_name):
    """Return the figures of the first row matching ``project_name``.

    Args:
        project_name: value looked up in the 'Référence Saphyr' column of ``df``.

    Returns:
        A list of 10 values in this order: year, month, forecast cost,
        valorization cost, forecast days, valorization days, forecast margin,
        valorization margin, forecast turnover, valorization turnover.

    Raises:
        IndexError: if no row of ``df`` matches ``project_name``. The type is
            the one ``.iloc[0]`` raised before; the message now names the project.
    """
    # Columns to return, in the order the prompt builder indexes them
    field_columns = [
        'Année', 'Mois', 'Prévision_M-1 (Coût)',
        'Valorisation (Coût)', 'Prévision_M-1 (Jours)',
        'Valorisation (Jours)', 'Prévision_M-1 (Marge%)',
        'Valorisation (Marge%)', 'Prévision_M-1 (Chiffre d’affaires)',
        'Valorisation (Chiffre d’affaires)',
    ]

    matching_rows = df[df['Référence Saphyr'] == project_name]
    if matching_rows.empty:
        # Fail with an explicit message instead of the bare positional-index error
        raise IndexError(f"No data found for project {project_name!r}")

    project_data = matching_rows.iloc[0].to_dict()
    return [project_data[column] for column in field_columns]

# Function to create prompt and generate analysis
def generate_analysis(project_name):
    data = extract_data(project_name)
    financial_prompt = (
        f"Analyse financière pour le projet {project_name} en {data[0]}-{data[1]}:\n"
        f"- Coûts prévisionnels: {data[2]}\n"
        f"- Coûts de valorisation: {data[3]}\n"
        f"- Jours prévisionnels: {data[4]}\n"
        f"- Jours de valorisation: {data[5]}\n"
        f"- Marge prévisionnelle: {data[6]}\n"
        f"- Marge de valorisation: {data[7]}\n"
        f"- Chiffre d'affaires prévisionnel: {data[8]}\n"
        f"- Chiffre d'affaires de valorisation: {data[9]}\n\n"
        "Veuillez analyser les éléments suivants en français:\n"
        "1. Analysez la valorisation mensuelle et fournissez des conseils.\n"
        "2. Analysez la prévision mensuelle et fournissez des conseils.\n"
        "Merci de fournir une analyse détaillée en français."
    )
    response = text_generation_pipeline(f"<s>[INST] {financial_prompt} [/INST]")
    return response[0]['generated_text']

In [None]:
df

# Step 12: Create a Gradio interface

In [None]:
# Define Gradio interface
project_dropdown = gr.Dropdown(label="Sélectionnez le projet", choices=df['Référence Saphyr'].unique().tolist())
output_textbox = gr.Textbox(label="Analyse", lines=10)

def on_submit(project_name):
    return generate_analysis(project_name)

# Build the Gradio interface
app = gr.Interface(
    fn=on_submit,
    inputs=[project_dropdown,
            gr.Slider(label="Longueur maximale de la réponse", minimum=50, maximum=1000, value=200, step=10)  # Slider for max_length
            ],
    outputs=[output_textbox],
    title="Générateur d'Analyse Financière",
    description="Sélectionnez un projet et obtenez une analyse financière détaillée."
)

# Launch the app
app.launch()