In [1]:
from datasets import load_dataset, concatenate_datasets, Value
from typing import List
import pandas as pd
import re
import unidecode
pd.set_option('display.max_rows', 25)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_column(data, name_col, value):
    """Return ``data`` with an extra constant column.

    Parameters
    ----------
    data : object supporting ``len()`` and ``add_column(name, values)``
        In this notebook, a Hugging Face ``Dataset``.
    name_col : str
        Name of the column to add.
    value : any
        Value repeated on every row (the same object is reused for each row).

    Returns
    -------
    Whatever ``data.add_column`` returns.
    """
    n_rows = len(data)
    constant_column = [value for _ in range(n_rows)]
    return data.add_column(name_col, constant_column)

def print_value_count(data, column_s):
    """Print how many times each value occurs in column ``column_s`` of ``data``.

    ``data[column_s]`` is loaded into a one-column DataFrame, grouped by that
    column, and the resulting counts are printed. Nothing is returned.
    """
    single_column = pd.DataFrame(data[column_s], columns=[column_s])
    counts = single_column.groupby(column_s).value_counts()
    print(counts)

def explode_answer(x):
    """Replace ``x["answers"]`` by its ``"text"`` entry and return ``x``.

    The record is modified in place; when ``"text"`` is absent the field
    becomes ``None``.
    """
    answer_struct = x["answers"]
    x["answers"] = answer_struct.get("text")
    return x

# Characters that survive cleaning; every other character is replaced by a space.
allowed_char = "abcdefghijklmnopqrstruvwxyz0123456789 ?,;.-!()+*_@><=:[]"


def remove_non_authorized_char(x):
    """Replace every character of ``x`` that is not in ``allowed_char`` by a space.

    Parameters
    ----------
    x : any
        Strings are used as-is; any other value is converted with ``str()``.

    Returns
    -------
    str
        Text of the same length as the (stringified) input. Upper-case letters
        are not in ``allowed_char``, so callers are expected to lower-case first.
    """
    # The previous check compared type(x) with x itself, which is never true.
    text = x if isinstance(x, str) else str(x)
    return "".join(ch if ch in allowed_char else " " for ch in text)

def data_preprocessing(data: pd.DataFrame, cols_clean: List[str], cols_keep: List[str]):
    """Clean the text columns of ``data`` and carry over the raw ``cols_keep`` columns.

    Each cell of ``cols_clean`` goes through four passes: transliteration to
    ASCII with ``unidecode``, lower-casing, replacement of characters outside
    ``allowed_char`` by spaces, and collapsing of whitespace runs into single
    spaces. Missing values are replaced by ``" "`` first; non-string cells are
    stringified.

    Parameters
    ----------
    data : pd.DataFrame
        Source frame; it is not modified.
    cols_clean : list of str
        Columns to clean.
    cols_keep : list of str
        Columns copied unchanged from ``data``. A name that also appears in
        ``cols_clean`` ends up holding the *uncleaned* values.

    Returns
    -------
    pd.DataFrame
        New frame with the ``cols_clean`` columns followed by any additional
        ``cols_keep`` columns.
    """

    def _text(cell):
        # Non-string cells are stringified before any text operation.
        return cell if type(cell) is str else str(cell)

    def _to_ascii(cell):
        return unidecode.unidecode(_text(cell))

    def _to_lower(cell):
        return _text(cell).lower()

    def _squeeze_spaces(cell):
        return " ".join(_text(cell).split())

    cleaned = data[cols_clean].fillna(" ").map(_to_ascii)
    cleaned = cleaned.fillna(" ").map(_to_lower)
    for step in (remove_non_authorized_char, _squeeze_spaces):
        cleaned = cleaned.map(step)

    # Raw columns are copied last, so they win over cleaned columns of the same name.
    cleaned[cols_keep] = data[cols_keep]

    return cleaned

----------------------------------------------------------

### Dataset summary
- None

### Supported Tasks and Leaderboards
- Not specified

### Use in models
- nol2pro (fine-tuned version of t5-small on the yahoo_answers_qa dataset.)

In [3]:
# Load the Yahoo Answers QA training split and tag every row with its split name.
dataset1 = load_dataset("yahoo_answers_qa", split="train")
dataset1_b = create_column(dataset1, "which_data", "train")

# Keep only the two categories of interest.
dataset1_b = dataset1_b.filter(
    lambda x: x["main_category"] in ("Business & Finance", "Computers & Internet")
)

# Constant provenance columns, added in this order.
for _name, _value in (
    ("source", "yahoo_answers_qa"),
    ("type", "general questions finance"),
    ("task", ["question answering tasks"]),
    ("model", "nol2pro"),
    ("language", "english"),
):
    dataset1_b = create_column(dataset1_b, _name, _value)

# The context column is filled with the literal string "None", then the
# columns are renamed to the shared schema and the unused ones dropped.
dataset1_b1 = create_column(dataset1_b, "context", "None")
dataset1_b1 = dataset1_b1.rename_columns(
    {"context": "context", 'question': 'question', "answer": "answer", "main_category": "question_category"}
)
dataset1_b1 = dataset1_b1.remove_columns(["nbestanswers", "id"])

# Wrap category and answer in single-element lists.
dataset1_b2 = dataset1_b1.map(lambda x: {"question_category": [x["question_category"]]})
dataset1_f = dataset1_b2.map(lambda x: {"answer": [x["answer"]]})

#### Data preprocessing

In [4]:
# Switch to pandas for the cell-wise text cleaning.
dataset1_fp = dataset1_f.to_pandas()


def replace_char(x):
    """Stringify ``x`` and replace every '&' by ','."""
    return str(x).replace('&', ',')


dataset1_fp.loc[:, "question_category"] = dataset1_fp["question_category"].map(replace_char)

cols_to_clean = ["question", "answer", "question_category", "context"]
cols_to_keep = ['which_data', 'source', 'type', 'task', 'model', 'context']
# NOTE(review): 'context' is in both lists, so data_preprocessing puts the
# uncleaned context back, and 'language' is not carried over - confirm intended.
dataset1_fp = data_preprocessing(
    dataset1_fp,
    cols_clean=cols_to_clean,
    cols_keep=cols_to_keep,
)

In [5]:
# Export the cleaned Yahoo subset as a pipe-separated CSV.
dataset1_fp.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\yahoo_general_question_datasets.csv",
    sep="|",
)

--------------------------------------------------------------

### Dataset Summary

This dataset originally comes from Kaggle. It was initially split into three tables (CSV files: Questions, Answers, and Tags), which are now merged into a single table. Each row corresponds to a question-answer pair and its associated tags.

The dataset contains all questions asked between August 2, 2008 and October 19, 2016.

### Supported Tasks and Leaderboards

This might be useful for open-domain question-answering tasks. 

### Use in model

- llama-2-7b-finetuned-python-qa_tokenizer (no more specifications)


In [6]:
dataset2 = load_dataset("koutch/stackoverflow_python")

# Constant provenance columns for the train split, added in this order.
dataset2_b = dataset2.get("train")
for _name, _value in (
    ("which_data", "train"),
    ("source", "koutch/stackoverflow_python"),
    ("type", "technical questions"),
    ("task", ["question answering tasks"]),
    ("model", "llama-2-7b-finetuned-python-qa_tokenizer"),
    ("language", "english"),
):
    dataset2_b = create_column(dataset2_b, _name, _value)

# Rename to the shared schema, drop ids/scores/dates, wrap the answer in a list.
dataset2_b1 = dataset2_b.rename_columns(
    {'title': "context", 'question_body': 'question', "answer_body": "answer", "tags": "question_category"}
)
dataset2_b1 = dataset2_b1.remove_columns(
    ["question_score", "question_id", "question_date", "answer_id", "answer_score", "answer_date"]
)
dataset2_f = dataset2_b1.map(lambda x: {"answer": [x["answer"]]})

# Unshuffled split, then a first fixed-size slice of 19742 rows from its "train" side.
# No CSV is written in this cell.
dataset2_splitted = dataset2_f.train_test_split(test_size=0.9, shuffle=False, seed=1256)
dataset2_split_1 = dataset2_splitted.get("train").train_test_split(
    train_size=19742, shuffle=False, seed=1256
)

#### Data preprocessing

In [7]:
def funtion_dataset2(data: pd.DataFrame):
    """Strip markup residue from a Stack Overflow chunk, then run the shared cleaning.

    Every match of ``regex_pattern`` (HTML paragraph/list/anchor fragments,
    literal backslash escapes, ``>``, the token ``sips`` and ``|``) in the
    ``question``, ``answer`` and ``context`` columns is replaced by a space;
    cells are stringified first. The result then goes through
    ``data_preprocessing``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding at least the columns listed in ``cols_to_clean`` and
        ``cols_to_keep`` below. It is left unmodified.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``data_preprocessing``.
    """
    regex_pattern = r"<p>|</p>\n\n<p>|<p>|\\r\\n|\\n|\\r|\\|</em|</p>\n\n<ul>\n<li>|</li>\n<li>|</li>\n</ul>\n|</p|</a>|>|<a|sips|\|"
    replace_char = lambda x: re.sub(regex_pattern, " ", str(x))

    # Work on a copy so the caller's frame is not altered as a side effect,
    # and index with a list (a tuple key is ambiguous with MultiIndex columns).
    text_cols = ["question", "answer", "context"]
    data = data.copy()
    data[text_cols] = data[text_cols].map(replace_char)

    cols_to_clean = ["question", "answer", "context", "question_category"]
    cols_to_keep = ['which_data', 'source', 'type', 'task', 'model', 'context']
    # NOTE(review): 'context' is in both lists, so the context returned is the
    # regex-stripped but otherwise uncleaned one, and 'language' is not carried
    # over - confirm intended.
    data = data_preprocessing(
        data,
        cols_clean=cols_to_clean,
        cols_keep=cols_to_keep)
    return data

In [8]:
# Clean the first 19742-row slice and export it as part 1.
dataset2_split_1p = funtion_dataset2(dataset2_split_1.get("train").to_pandas())
dataset2_split_1p.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\kaggle_dataset_part1" + ".csv",
    sep="|",
)

In [9]:
# Parts 2-4: each pass re-splits the previous "test" remainder and exports the
# new 19742-row "train" slice.
data_temp = dataset2_split_1
for i in range(2, 5):
    data_temp = data_temp.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
    dataset_split = data_temp
    data = funtion_dataset2(dataset_split.get("train").to_pandas())
    data.to_csv(
        path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\kaggle_dataset_part" + str(i) + ".csv",
        sep="|",
    )

In [10]:
# First 19742-row slice of the "test" side of dataset2_splitted -> part 5.
dataset2_split_5 = dataset2_splitted.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
dataset2_split_5p = dataset2_split_5.get("train").to_pandas()
dataset2_split_5p = funtion_dataset2(dataset2_split_5p)
dataset2_split_5p.to_csv("C:\\Users\\yburt\\Documents\\dataset_tests\\kaggle_dataset_part5.csv", sep="|")

# Remaining slices: each pass re-splits the previous "test" remainder.
data_temp = dataset2_split_5
for i in range(1, 45):
    dataset_split = data_temp.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
    data_temp = dataset_split
    data = dataset_split.get("train").to_pandas()
    data = funtion_dataset2(data)
    # Part 5 is already written above, so loop files start at part 6 (i + 5);
    # with i + 4 the first pass overwrote kaggle_dataset_part5.csv.
    data.to_csv("C:\\Users\\yburt\\Documents\\dataset_tests\\kaggle_dataset_part" + str(i + 5) +".csv", sep="|")

-------------------------------------------------

#### SQuAD-fr:

- a translated version of the Stanford Question Answering Dataset (SQuAD) into French obtained through automatic translation of the English dataset
- a reading comprehension dataset consisting of approximately 90K factoid questions on Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage; it serves as a means of data augmentation on the FQuAD and PIAF benchmarks


#### Supported Tasks and Leaderboards
closed-domain-qa, text-retrieval: This dataset is intended to be used for closed-domain-qa, but can also be used for information retrieval tasks.

### Use in models

- No communication

In [11]:
# Load SQuAD-fr and reduce each "answers" field to its "text" entry.
dataset3 = load_dataset("qwant/squad_fr").map(explode_answer)

# Constant provenance columns for the train split, added in this order.
dataset3_b = dataset3.get("train")
for _name, _value in (
    ("which_data", "train"),
    ("source", "qwant/squad_fr"),
    ("type", "general questions"),
    ("task", ["question answering tasks", "text-retrieval"]),
    ("model", "no communication"),
    ("language", "french"),
):
    dataset3_b = create_column(dataset3_b, _name, _value)

# Keep only the articles whose title is in the list below.
dataset3_b = dataset3_b.filter(
    lambda example: example["title"] in (
        "Energy",
        "Communication",
        "Computer",
        "Computer_security",
        "Economic_inequality",
        "European_Union_law",
    )
)

In [12]:
# Constant provenance columns for the validation split.
dataset3_t = create_column(dataset3.get("validation"), "which_data", "validation")
dataset3_t = create_column(dataset3_t, "source", "qwant/squad_fr")
dataset3_t = create_column(dataset3_t, "type", "general questions")
dataset3_t = create_column(dataset3_t, "task", ["question answering tasks", "text-retrieval"])
dataset3_t = create_column(dataset3_t, "model", "no communication")
# The train subset (dataset3_b) carries a "language" column; add it here too so
# both subsets share one schema when they are concatenated.
dataset3_t = create_column(dataset3_t, "language", "french")

In [13]:
# Stack validation and train rows, then add a placeholder category column.
dataset3_c = create_column(
    concatenate_datasets([dataset3_t, dataset3_b]),
    "question_category",
    "None",
)

# Rename to the shared schema and drop the columns not exported.
dataset3_c = (
    dataset3_c
    .rename_columns({"context": "context", 'question': 'question', "answers": "answer"})
    .remove_columns(["title", "id"])
)

# Wrap the category in a single-element list.
dataset3_f = dataset3_c.map(lambda x: {"question_category": [x["question_category"]]})

#### Data preprocessing

In [14]:
cols_to_clean = ["question", "answer", "question_category", "context"]
cols_to_keep = ['which_data', 'source', 'type', 'task', 'model', 'context']
# NOTE(review): 'context' is in both lists, so data_preprocessing puts the
# uncleaned context back, and 'language' is not carried over - confirm intended.
dataset3_fp = data_preprocessing(
    dataset3_f.to_pandas(),
    cols_clean=cols_to_clean,
    cols_keep=cols_to_keep,
)

In [15]:
# Export the cleaned SQuAD-fr subset as a pipe-separated CSV.
dataset3_fp.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\standord_translated_dataset.csv",
    sep="|",
)

----------------------------------------------------
### Dataset Summary
We are thrilled to announce the release of the OpenOrca dataset! This rich collection of augmented FLAN data aligns, as best as possible, with the distributions outlined in the Orca paper. It has been instrumental in generating high-performing model checkpoints and serves as a valuable resource for all NLP researchers and developers!

### Supported Tasks and Leaderboards

This dataset supports a range of tasks including language modeling, text generation, and text augmentation. It has been instrumental in the generation of multiple high-performing model checkpoints which have exhibited exceptional performance in our unit testing. 

### Use in model

- Fine tuning on top of Mistral

In [16]:
dataset4 = load_dataset("Open-Orca/OpenOrca")

# Constant provenance columns for the train split, added in this order.
dataset4_b = dataset4.get("train")
for _name, _value in (
    ("which_data", "train"),
    ("source", "Open-Orca/OpenOrca"),
    ("type", "general questions"),
    ("task", ["language modeling", "text generation", "text augmentation"]),
    ("model", "fine tuned mistral 7B"),
    ("language", "english"),
):
    dataset4_b = create_column(dataset4_b, _name, _value)

In [17]:
# Rename to the shared schema (the system prompt becomes the context) and drop the id.
dataset4_b1 = (
    dataset4_b
    .rename_columns({"system_prompt": "context", 'question': 'question', "response": "answer"})
    .remove_columns(["id"])
)
# Placeholder category column filled with the literal string "None".
dataset4_b1 = create_column(dataset4_b1, "question_category", "None")

# Wrap category and answer in single-element lists.
dataset4_b2 = dataset4_b1.map(
    lambda x: {"question_category": [x["question_category"]]}
)
dataset4_f = dataset4_b2.map(
    lambda x: {"answer": [x["answer"]]}
)

#### Data preprocessing

In [18]:
def funtion_dataset4(data: pd.DataFrame):
    """Remove newlines and pipes from an OpenOrca chunk, then run the shared cleaning.

    Every newline or ``|`` in the ``question`` and ``answer`` columns is
    replaced by a space (cells are stringified first); ``context`` is not
    touched by this step. The result then goes through ``data_preprocessing``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding at least the columns listed in ``cols_to_clean`` and
        ``cols_to_keep`` below. It is left unmodified.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``data_preprocessing``.
    """
    regex_pattern = r"[\n|\n]|\n"
    replace_char = lambda x: re.sub(regex_pattern, " ", str(x))

    # Work on a copy so the caller's frame is not altered as a side effect,
    # and index with a list (a tuple key is ambiguous with MultiIndex columns).
    text_cols = ["question", "answer"]
    data = data.copy()
    data[text_cols] = data[text_cols].map(replace_char)

    cols_to_clean = ["question", "answer", "context", "question_category"]
    cols_to_keep = ['which_data', 'source', 'type', 'task', 'model', 'context']
    # NOTE(review): 'context' is in both lists, so the context returned is the
    # uncleaned one, and 'language' is not carried over - confirm intended.
    data = data_preprocessing(
        data,
        cols_clean=cols_to_clean,
        cols_keep=cols_to_keep)
    return data

In [19]:
# Unshuffled split, then a first fixed-size slice of 19742 rows from its "train" side.
dataset4_splitted = dataset4_f.train_test_split(test_size=0.9, shuffle=False, seed=1256)
dataset4_split_1 = dataset4_splitted.get("train").train_test_split(
    train_size=19742, shuffle=False, seed=1256
)

# Part 1.
dataset4_split_1p = funtion_dataset4(dataset4_split_1.get("train").to_pandas())
dataset4_split_1p.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\mistral_replica_part1.csv",
    sep="|",
)

# Parts 2-4: each pass re-splits the previous "test" remainder.
data_temp = dataset4_split_1
for i in range(2, 5):
    data_temp = data_temp.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
    dataset_split = data_temp
    dataset_splitp = funtion_dataset4(dataset_split.get("train").to_pandas())
    dataset_splitp.to_csv(
        path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\mistral_replica_part" + str(i) + ".csv",
        sep="|",
    )

In [20]:
# First 19742-row slice of the "test" side of dataset4_splitted -> part 5.
dataset4_split_5 = dataset4_splitted.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
dataset4_split_5p = dataset4_split_5.get("train").to_pandas()
dataset4_split_5p = funtion_dataset4(dataset4_split_5p)
dataset4_split_5p.to_csv("C:\\Users\\yburt\\Documents\\dataset_tests\\mistral_replica_part5.csv", sep="|")

# Remaining slices: each pass re-splits the previous "test" remainder.
data_temp = dataset4_split_5
for i in range(1, 100):
    dataset_split = data_temp.get("test").train_test_split(train_size=19742, shuffle=False, seed=1256)
    data_temp = dataset_split
    dataset_splitp = dataset_split.get("train").to_pandas()
    dataset_splitp = funtion_dataset4(dataset_splitp)
    # Part 5 is already written above, so loop files start at part 6 (i + 5);
    # with i + 4 the first pass overwrote mistral_replica_part5.csv.
    dataset_splitp.to_csv("C:\\Users\\yburt\\Documents\\dataset_tests\\mistral_replica_part" + str(i + 5) +".csv", sep="|")

-------------------------------------------------------------------
### Dataset Summary

FinTalk-19k is a domain-specific dataset designed for the fine-tuning of Large Language Models (LLMs) with a focus on financial conversations. Extracted from public Reddit conversations, this dataset is tagged with categories like "Personal Finance", "Financial Information", and "Public Sentiment". It consists of more than 19,000 entries, each representing a conversation about financial topics.

### Supported Tasks and Leaderboards

- language-modeling: The dataset can be used to train models for language modeling in the context of financial discussions.
- text-generation: Suitable for generating responses in financial conversations.

### Languages

The dataset is primarily in English.

### Sources: from REDDIT

### Use in model
- No communication

In [21]:
dataset5 = load_dataset("ceadar-ie/FinTalk-19k")

# Constant provenance columns for the train split, added in this order.
dataset5_b = dataset5.get("train")
for _name, _value in (
    ("which_data", "train"),
    ("source", "ceadar-ie/FinTalk-19k"),
    ("type", "general financial questions"),
    ("task", ["question answering tasks"]),
    ("model", "no communication"),
    ("language", "english"),
):
    dataset5_b = create_column(dataset5_b, _name, _value)

In [22]:
# Rename to the shared schema: instruction -> question, response -> answer, tag -> question_category.
dataset5_b1 = dataset5_b.rename_columns(
    {"context": "context", 'instruction': 'question', "response": "answer", "tag":"question_category"}
)

# Wrap category and answer in single-element lists.
dataset5_b2 = dataset5_b1.map(
    lambda x: {"question_category": [x["question_category"]]}
)
dataset5_f = dataset5_b2.map(
    lambda x: {"answer": [x["answer"]]}
)

#### Data preprocessing

In [23]:
# Convert to pandas and clean the four text columns; the metadata columns are copied as-is.
dataset5_fp = data_preprocessing(
    dataset5_f.to_pandas(),
    cols_clean=["question", "answer", "context", "question_category"],
    cols_keep=["which_data", "source", "type", "task", "model", "language"],
)

In [24]:
# Export the cleaned FinTalk data as a pipe-separated CSV.
dataset5_fp.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\reddit_technical.csv",
    sep="|",
)

-------------------------------------------------------------------
### Dataset Summary

The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.

### Supported Tasks and Leaderboards

- fact checking

### Model in use
-  mDeBERTa-v3-base-tasksource-nli 

In [25]:
dataset6 = load_dataset("strombergnlp/x-stance", "fr")

# Constant provenance columns for the validation split, added in this order.
dataset6_b = dataset6.get("validation")
for _name, _value in (
    ("which_data", "validation"),
    ("source", "strombergnlp/x-stance"),
    ("type", "political questions"),
    ("task", ["question answering tasks"]),
    ("model", "mDeBERTa-v3-base-tasksource-nli"),
):
    dataset6_b = create_column(dataset6_b, _name, _value)

In [26]:
# Constant provenance columns for the test split.
dataset6_t = create_column(dataset6.get("test"), "which_data", "test")
dataset6_t = create_column(dataset6_t, "source", "strombergnlp/x-stance")
dataset6_t = create_column(dataset6_t, "type", "political questions")
dataset6_t = create_column(dataset6_t, "task", ["question answering tasks"])
# The train and validation subsets (dataset6_q, dataset6_b) carry a "model"
# column; without it here the test rows have no model value after concatenation.
dataset6_t = create_column(dataset6_t, "model", "mDeBERTa-v3-base-tasksource-nli")

In [27]:
# Constant provenance columns for the train split, added in this order.
dataset6_q = dataset6.get("train")
for _name, _value in (
    ("which_data", "train"),
    ("source", "strombergnlp/x-stance"),
    ("type", "political questions"),
    ("task", ["question answering tasks"]),
    ("model", "mDeBERTa-v3-base-tasksource-nli"),
):
    dataset6_q = create_column(dataset6_q, _name, _value)

In [28]:
# Stack train, test and validation rows, then add the columns shared by all of them.
dataset6_c = concatenate_datasets([dataset6_q, dataset6_t, dataset6_b])
for _name, _value in (
    ("question_category", "None"),
    ("language", "french"),
):
    dataset6_c = create_column(dataset6_c, _name, _value)

# Rename to the shared schema (label -> context, comment -> answer) and drop the id.
dataset6_c = (
    dataset6_c
    .rename_columns({'label': "context", 'question': 'question', "comment": "answer"})
    .remove_columns(["id"])
)

# Wrap category and answer in single-element lists, then cast the context to string.
dataset6_c1 = dataset6_c.map(lambda x: {"question_category": [x["question_category"]]})
dataset6_c2 = dataset6_c1.map(lambda x: {"answer": [x["answer"]]})
dataset6_f = dataset6_c2.cast_column('context', Value(dtype='string', id=None))

In [29]:
# Convert to pandas, clean the text columns and export as a pipe-separated CSV.
# Note: dataset6_f is rebound from a Dataset to a DataFrame here.
dataset6_f = data_preprocessing(
    dataset6_f.to_pandas(),
    cols_clean=["question", "answer", "context", "question_category"],
    cols_keep=["which_data", "source", "type", "task", "model", "language"],
)
dataset6_f.to_csv(
    path_or_buf="C:\\Users\\yburt\\Documents\\dataset_tests\\political_questions.csv",
    sep="|",
)

----------------------------------------------------------------------------------------------