In [1]:
# Install the notebook's dependencies with pip in quiet (-q) mode.
# transformers is installed from its GitHub repository rather than from a PyPI release.
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install datasets loralib sentencepiece 
!pip -q install bitsandbytes accelerate

In [2]:
import os
import re
import json
# import torch
import textwrap
import os.path as osp
from typing import Union
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# config
# dataset_name: base name (without the ".json" suffix) of the dataset file
# that is loaded from / saved to '../dataset/'.
dataset_name = 'flowbird-april--koala-7b'
# model_name: identifier handed to from_pretrained() for both the tokenizer
# and the model.
model_name = 'samwit/koala-7b'
# model_name = 'chavinlo/alpaca-native'


In [4]:
# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Load the causal-LM weights with load_in_8bit=True and device_map='auto'.
# NOTE(review): 8-bit loading presumably relies on the bitsandbytes and
# accelerate packages installed in the first cell — confirm against the
# transformers documentation.
base_model = LlamaForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
)





Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/uns/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Loading binary /home/uns/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Loading checkpoint shards: 100%|██████████| 14/14 [00:09<00:00,  1.41it/s]


In [5]:
# Show the GPU status table (driver/CUDA versions, memory usage, utilisation).
!nvidia-smi

Thu Apr 20 21:18:39 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.04              Driver Version: 531.29       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         On | 00000000:01:00.0  On |                  N/A |
| 53%   45C    P2              116W / 350W|  10815MiB / 24576MiB |      6%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [15]:
%%time

# decent configs
# Build a "text-generation" pipeline around the already-loaded model and
# tokenizer; the remaining keyword arguments are generation settings.
# NOTE(review): temperature/top_p presumably only take effect when sampling is
# enabled (do_sample=True), which is not set here — confirm against the
# transformers generation docs and the checkpoint's generation config.
pipe = pipeline(
    "text-generation",
    model=base_model, 
    tokenizer=tokenizer, 
    max_length=612,
    temperature=0.2,
    top_p=0.95,
    repetition_penalty=1.1
)

# default from https://www.youtube.com/watch?v=kSLcedGSez8&t=310s
# pipe = pipeline(
#     "text-generation",
#     model=base_model, 
#     tokenizer=tokenizer, 
#     max_length=512,
#     temperature=0.7,
#     top_p=0.95,
#     repetition_penalty=1.15
# )


# pipe = pipeline(
#     "text-generation",
#     model=base_model, 
#     tokenizer=tokenizer, 
#     max_length=512,
#     temperature=0.3,
#     top_p=0.95,
#     repetition_penalty=1.1
# )

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    Every newline-separated line is wrapped independently with
    ``textwrap.fill`` and the wrapped pieces are glued back together with
    newline characters.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The re-wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

CPU times: user 237 µs, sys: 55 µs, total: 292 µs
Wall time: 295 µs


In [21]:


class Prompter(object):
    """Builds prompts from a JSON template file and extracts model responses.

    The template is a JSON object; this class reads the keys
    ``"prompt_input"``, ``"prompt_no_input"`` and ``"response_split"``, plus
    ``"description"`` when ``verbose`` is enabled.
    """

    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False, runner: str = 'google-colab'):
        """Load ``<base_path>/<template_name>.json``.

        Args:
            template_name: Template file name without the ``.json`` suffix.
                An empty value falls back to ``"alpaca"``.
            verbose: When true, print the template description on load and
                every generated prompt.
            runner: ``'google-colab'`` reads templates from ``/content/``;
                any other value reads them from ``../templates/``.

        Raises:
            ValueError: If the template file does not exist.
        """
        self._verbose = verbose

        base_path = '/content/' if runner == 'google-colab' else '../templates/'

        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join(base_path, f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        # Read the template as UTF-8 explicitly: relying on the locale's
        # default encoding garbles non-ASCII template text on some platforms.
        with open(file_name, encoding="utf-8") as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        """Return the full prompt for ``instruction`` and optional ``input``.

        The ``"prompt_input"`` template is used when ``input`` is truthy,
        otherwise ``"prompt_no_input"``. If a ``label`` (response/output) is
        given it is appended directly after the formatted prompt.
        """
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        """Return the stripped text following the template's ``"response_split"`` marker.

        Raises:
            IndexError: If ``output`` does not contain the marker.
        """
        return output.split(self.template["response_split"])[1].strip()

# One prompter per template file. Any runner other than 'google-colab'
# (here "local") makes Prompter read templates from '../templates/'.
question_prompter = Prompter(template_name="generate_question", runner="local")
answer_prompter = Prompter(template_name="generate_answer", runner="local")


In [8]:
# get data set


dataset = []
dataset_path = osp.join('../dataset/', f"{dataset_name}.json")


# Resume from a previously saved dataset when one exists; otherwise start empty.
try:
    # The context manager closes the handle even when json.load() raises.
    with open(dataset_path) as fp:
        dataset = json.load(fp)
except FileNotFoundError:
    # Nothing has been saved under this name yet.
    dataset = []
except (OSError, ValueError) as exc:
    # Unreadable or malformed file (json.JSONDecodeError and
    # UnicodeDecodeError are ValueError subclasses). Keep the best-effort
    # fallback, but say so: the next save will overwrite this file.
    print(f"Could not load {dataset_path} ({exc!r}); starting with an empty dataset")
    dataset = []

    
def save_dataset():
    """Write the module-level ``dataset`` to ``dataset_path`` as indented JSON.

    The JSON text is built before the file is opened for writing, so an entry
    that cannot be serialised raises without truncating the dataset file that
    is already on disk.
    """
    payload = json.dumps(dataset, ensure_ascii=False, indent=2)
    with open(dataset_path, 'w') as fp:
        fp.write(payload)
        

In [9]:


def extract_response(text: str):
    """Return the stripped segment that follows the first response marker.

    The marker is the literal '### Response:' line terminated by a newline.
    If the marker occurs more than once, only the text between the first and
    second occurrence is returned.

    Raises:
        IndexError: If ``text`` does not contain the marker.
    """
    response_marker = '### Response:\n'
    segments = text.split(response_marker)
    after_marker = segments[1]
    return after_marker.strip()


# def response_to_array(text: str):
#     pattern = r'^d\.\s{3}'
#     questions = text.rsplit(pattern);
#     qs_cleaned = [];
    
#     for q in questions: 
#         q_cleaned = re.split(pattern, q)[1]
#         qs_cleaned.append(q_cleaned)
    
#     return qs_cleaned

def fix_encoding_output(text: str):
    """Convert HTML ordered-list markup in ``text`` to plain numbered items.

    Every '<ol start="N"><li>' opener becomes 'N.' followed by three spaces,
    and every '</li></ol>' closer is removed. ``N`` may have any number of
    digits, so items numbered 10 and above are converted as well.

    Args:
        text: Generated text that may contain the list markup.

    Returns:
        The text with the list markup replaced.
    """
    start_encode_pattern = r'<ol start="(\d+)"><li>'
    end_encode_pattern = r"</li></ol>"

    text = re.sub(start_encode_pattern, r"\1.   ", text)
    text = re.sub(end_encode_pattern, "", text)

    return text


def response_to_array(text: str):
    """Split a numbered list into its items.

    An item starts at the beginning of a line with one or more digits, a
    literal dot and exactly three whitespace characters (for example
    '1.' followed by three spaces). The numbering prefix is removed; any text
    before the first item — the empty string when the text starts with an
    item — is returned as the first element.

    Args:
        text: The numbered list as a single string.

    Returns:
        A list of strings: the leading text followed by one entry per item.
    """
    # The dot is escaped so only "N." prefixes split the text, and \d+ lets
    # items numbered 10 and above split as well.
    splitter_pattern = r"^\d+\.\s{3}"
    # maxsplit/flags are passed by keyword; passing them positionally is
    # deprecated in recent Python versions.
    return re.split(splitter_pattern, text, maxsplit=0, flags=re.MULTILINE)

In [30]:

def process_raw_text(text: str):
    """Generate question/answer pairs for ``text`` and append them to ``dataset``.

    Steps:
      1. Ask the model for questions about ``text`` (``question_prompter``).
      2. Ask the model to answer those questions with ``text`` as the
         instruction (``answer_prompter``).
      3. Split both responses into numbered items and append one record per
         non-empty question to the module-level ``dataset`` list.

    The full answer generation is printed for inspection.

    Args:
        text: The raw source passage.
    """
    # --- questions ---
    q_prompt = question_prompter.generate_prompt(
        '',
        text
    )
    q_output = pipe(q_prompt)
    questions_text = fix_encoding_output(q_output[0]['generated_text'])
    questions = extract_response(questions_text)
    questions_array = response_to_array(questions)

    # --- answers ---
    a_prompt = answer_prompter.generate_prompt(
        text,
        questions
    )
    a_output = pipe(a_prompt)
    answers_text = fix_encoding_output(a_output[0]['generated_text'])
    print('\n\n')
    print(wrap_text_preserve_newlines(answers_text))
    answers = extract_response(answers_text)
    answers_array = response_to_array(answers)

    # The model is free to return a different number of answers than
    # questions; indexing answers by question position would then raise
    # IndexError part-way through. Pair by position and stop at the shorter
    # list instead, reporting the mismatch.
    if len(questions_array) != len(answers_array):
        print(
            f"Warning: {len(questions_array)} question segment(s) but "
            f"{len(answers_array)} answer segment(s); unmatched items are skipped"
        )

    for question, answer in zip(questions_array, answers_array):
        # Splitting leaves an empty leading segment when the text starts
        # with "1."; skip such empty questions.
        if not question:
            continue
        dataset.append({
            "instruction": question,
            "input": "In context of Flowbird Group",
            "output": answer
        })




In [11]:
# extract raw text
raw_text_path = osp.join('../dataset/', "raw.text")
raw_text = ''
try:
    # The context manager closes the handle even when read() raises.
    with open(raw_text_path) as f:
        raw_text = f.read()
except (OSError, UnicodeDecodeError) as exc:
    # Best effort: a missing or unreadable file yields no cases, but say why.
    print(f"Could not read {raw_text_path} ({exc!r}); continuing with no text cases")
    raw_text = ''

# Cases are delimited by the literal separator "\n---\n". Blank segments
# (e.g. the single empty string produced when the file is missing or empty)
# are dropped so they are never sent to the model.
text_cases = [segment for segment in raw_text.split("\n---\n") if segment.strip()]

print(json.dumps(text_cases, indent=4))

[
    "WE ARE FLOWBIRD\nFlowbird offers end to end Smart mobility, Parking and Electric Vehicle charging solutions. We provide towns and cities with the tools to make mobility in urban areas simple, multimodal and environmentally friendly, and the ability to understand and manage this mobility.\nPartnering with local administrations and operators for over 65 years, we provide apps, terminals, kerbside management, enforcement - and the software to manage these remotely. We upgrade existing physical terminals to digital terminals with new services and tariffs and open them up to integrate third party apps. Flowbird orchestrates sales, payments, reporting, invoices and fines across multiple endpoints, also providing the data insights needed to inform decisions.\n",
    "\nFLOWBIRD STATISTIC IN 2023\n80 Countries\n4,350 Cities\n350,000,000 Transactions per year\n2,000,000 Tickets sold every day\n",
    "\nFLOWBIRD global payment platform manages more than 300M banking transactions a year. 

In [46]:


# process_raw_text(text_cases[0])
# save_dataset()

# Generate question/answer records for every raw text case. The dataset file
# is rewritten after each case, so records collected so far are already on
# disk when a later case fails.
for case in text_cases:
  process_raw_text(case)
  save_dataset()



In [45]:
# print(json.dumps(answers_array, indent=4))
# print(json.dumps(questions_array, indent=4))
# print(json.dumps(dataset, indent=4))

# dataset = []


# save_dataset()

[
    "",
    "Some of the challenges faced by Flowbird in implementing its solution include:\n\n*   Managing the integration of multiple systems and technologies, including mobile apps, terminals, and back-end systems.\n*   Ensuring compatibility with different types of vehicles and payment methods.\n*   Providing seamless customer service and support to users.\n\n",
    "How does Flowbird's solution address these challenges?\n\n*   By leveraging advanced technology such as machine learning and artificial intelligence to automate processes and improve user experience.\n*   By partnering with local administrations and operators to ensure compatibility and accessibility.\n*   By providing comprehensive training and support to help customers get started and use the system effectively.\n\n",
    "What are some potential benefits of using Flowbird's solution?\n\n*   Improved mobility and accessibility for people and goods in urban areas.\n*   Reduced traffic congestion and air pollution th