From b86bece102928fdf3a0be71364059cff200d70ba Mon Sep 17 00:00:00 2001 From: Jiahang Li Date: Sun, 16 Apr 2023 20:26:53 -0700 Subject: [PATCH] add custom dataset dwmw17 --- datasets/download_text_classification.sh | 77 ++++++++++--------- .../data_utils/text_classification_dataset.py | 33 ++++++++ 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/datasets/download_text_classification.sh b/datasets/download_text_classification.sh index 3f654d1..1d3e6f3 100755 --- a/datasets/download_text_classification.sh +++ b/datasets/download_text_classification.sh @@ -3,41 +3,46 @@ DIR="./TextClassification" mkdir $DIR cd $DIR -rm -rf mnli -wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 -tar -zxvf mnli.tar.gz -rm -rf mnli.tar.gz - -rm -rf agnews -wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 -tar -zxvf agnews.tar.gz -rm -rf agnews.tar.gz - -rm -rf dbpedia -wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 -tar -zxvf dbpedia.tar.gz -rm -rf dbpedia.tar.gz - -rm -rf imdb -wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 -tar -zxvf imdb.tar.gz -rm -rf imdb.tar.gz - -rm -rf SST-2 -wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 -tar -zxvf SST-2.tar.gz -rm -rf SST-2.tar.gz - -rm -rf amazon -wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 -tar -zxvf amazon.tar.gz -mv datasets/amazon/ amazon -rm -rf ./datasets -rm -rf amazon.tar.gz - -rm -rf yahoo_answers_topics -wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 -tar -zxvf yahoo_answers_topics.tar.gz -rm -rf yahoo_answers_topics.tar.gz +# rm -rf mnli +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 +# tar -zxvf mnli.tar.gz +# rm -rf mnli.tar.gz + +# rm -rf agnews +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 +# tar -zxvf agnews.tar.gz +# rm -rf agnews.tar.gz + +# rm -rf dbpedia +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 +# tar -zxvf dbpedia.tar.gz +# rm -rf dbpedia.tar.gz + +# rm -rf imdb +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 +# tar -zxvf imdb.tar.gz +# rm -rf imdb.tar.gz + +# rm -rf SST-2 +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 +# tar -zxvf SST-2.tar.gz +# rm -rf SST-2.tar.gz + +# rm -rf amazon +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 +# tar -zxvf amazon.tar.gz +# mv datasets/amazon/ amazon +# rm -rf ./datasets +# rm -rf amazon.tar.gz + +# rm -rf yahoo_answers_topics +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 +# tar -zxvf yahoo_answers_topics.tar.gz +# rm -rf yahoo_answers_topics.tar.gz + +rm -rf dwmw17 +wget --content-disposition https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv +mkdir -p dwmw17 +mv labeled_data.csv dwmw17 cd .. diff --git a/openprompt/data_utils/text_classification_dataset.py b/openprompt/data_utils/text_classification_dataset.py index 1dde85c..f9ca7ce 100644 --- a/openprompt/data_utils/text_classification_dataset.py +++ b/openprompt/data_utils/text_classification_dataset.py @@ -17,6 +17,7 @@ import os import json, csv +import pandas as pd from abc import ABC, abstractmethod from collections import defaultdict, Counter from typing import List, Dict, Callable @@ -27,6 +28,38 @@ from openprompt.data_utils.data_processor import DataProcessor +class Dwmw17Processor(DataProcessor): + def __init__(self): + super().__init__() + self.labels = [ "hate speech", "offensive language", "neither" ] + + def get_examples(self, data_dir, split): + df = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv')) + """ + 24783 rows in total, 0: 1430, 1: 19190, 2: 4163 + I will take 50% as training and 50% as testing + """ + train_splits = [ 715, 9595, 2081 ] + examples = [] + for label_idx in range(len(self.labels)): + df_label = df[df['class'] == label_idx] + train_split = train_splits[label_idx] + + tweets = df_label['tweet'].tolist() + indexs = df_label.iloc[:, 0].tolist() + if split == 'train': + tweets_split = tweets[:train_split] + indexs_split = indexs[:train_split] + else: + tweets_split = tweets[train_split:] + indexs_split = indexs[train_split:] + for tweet, index in zip(tweets_split, indexs_split): + examples.append(InputExample( + guid=str(index), text_a=tweet, text_b="", label=label_idx + )) + return examples + + class MnliProcessor(DataProcessor): # TODO Test needed def __init__(self):