# TweetEval stance detection

In [1]:
import pandas as pd
from pathlib import Path
#import swifter
from scipy.stats import entropy
from collections import Counter

In [2]:
!git clone https://github.com/cardiffnlp/tweeteval.git

Cloning into 'tweeteval'...
remote: Enumerating objects: 370, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 370 (delta 13), reused 3 (delta 1), pack-reused 354[K
Receiving objects: 100% (370/370), 8.49 MiB | 8.10 MiB/s, done.
Resolving deltas: 100% (122/122), done.


In [3]:
# Input: stance task files inside the TweetEval checkout.
DATA_DIR = Path("tweeteval", "datasets")
STANCE_DIR = DATA_DIR.joinpath("stance")

# Output: folder that receives the generated prompt files; created if missing.
GPT_INPUT_PATH = Path("gpt-input")
GPT_INPUT_PATH.mkdir(exist_ok=True)

In [4]:
# Topic folder names, used to build the per-topic paths under STANCE_DIR.
TOPICS = ["abortion", "atheism", "climate", "feminist", "hillary"]

# Topic folder name -> topic name (only "feminist" differs from its key).
TOPIC_MAP = dict(
    abortion="abortion",
    atheism="atheism",
    feminist="feminism",
    hillary="hillary",
    climate="climate",
)

# Integer label id -> stance name; this dict is rendered verbatim into prompts.
MAPPING = dict(enumerate(["none", "against", "favor"]))

# The label ids as strings.
RESULTS = [str(label_id) for label_id in MAPPING]

## Structure

In [5]:
def load_test_data(dataset_path: Path, topic):
    """Load the test split of one stance topic into a DataFrame.

    Reads ``test_text.txt`` and ``test_labels.txt`` from ``dataset_path``, one
    record per line, and pairs them by position.

    The files are read line by line instead of through ``pd.read_csv`` with a
    placeholder separator. That approach interprets the separator as a regex
    (a tweet containing the word "delimiter" gets split into several fields),
    converts tweets such as "NA" or "null" to NaN, and triggers a
    ParserWarning about falling back to the python engine.

    Args:
        dataset_path: Directory that contains the two split files.
        topic: Value stored in the ``topic`` column of every row.

    Returns:
        DataFrame with columns ``text``, ``topic`` and ``labels`` (int64) and
        a default integer index.

    Raises:
        ValueError: If the two files hold a different number of non-blank
            lines, or if a label is not an integer.
    """

    def _read_lines(path: Path):
        # Whitespace is stripped and blank lines are dropped, as the previous
        # read_csv-based loader did. "utf-8-sig" also tolerates a leading BOM.
        with path.open(encoding="utf-8-sig") as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]

    text_path = dataset_path / "test_text.txt"
    labels_path = dataset_path / "test_labels.txt"

    texts = _read_lines(text_path)
    raw_labels = _read_lines(labels_path)

    # Fail loudly instead of silently padding the shorter column with NaN.
    if len(texts) != len(raw_labels):
        raise ValueError(
            f"{text_path} has {len(texts)} rows but {labels_path} has {len(raw_labels)}"
        )

    try:
        labels = [int(value) for value in raw_labels]
    except ValueError as exc:
        raise ValueError(f"Non-integer label found in {labels_path}") from exc

    frame = pd.DataFrame(
        {"text": texts, "topic": [topic] * len(texts), "labels": labels}
    )
    # Keep an integer dtype even when the files are empty.
    return frame.astype({"labels": "int64"})

In [None]:

# Load the test split of every topic, then stack the frames under one fresh index.
dfs = [load_test_data(STANCE_DIR.joinpath(name), name) for name in TOPICS]
df = pd.concat(dfs, ignore_index=True)

df

In [7]:
# Dataset entropy: entropy of the label counts (scipy normalises the counts and
# uses the natural logarithm by default).
label_counts = Counter(df["labels"])
entropy(list(label_counts.values()))

0.9748409598405161

In [None]:
def load_train_data(dataset_path: Path, topic):
    """Load the training split of one stance topic into a DataFrame.

    Reads ``train_text.txt`` and ``train_labels.txt`` from ``dataset_path``,
    one record per line, and pairs them by position.

    The files are read line by line instead of through ``pd.read_csv`` with a
    placeholder separator. That approach interprets the separator as a regex
    (a tweet containing the word "delimiter" gets split into several fields),
    converts tweets such as "NA" or "null" to NaN, and triggers a
    ParserWarning about falling back to the python engine.

    Args:
        dataset_path: Directory that contains the two split files.
        topic: Value stored in the ``topic`` column of every row.

    Returns:
        DataFrame with columns ``text``, ``topic`` and ``labels`` (int64) and
        a default integer index.

    Raises:
        ValueError: If the two files hold a different number of non-blank
            lines, or if a label is not an integer.
    """

    def _read_lines(path: Path):
        # Whitespace is stripped and blank lines are dropped, as the previous
        # read_csv-based loader did. "utf-8-sig" also tolerates a leading BOM.
        with path.open(encoding="utf-8-sig") as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]

    text_path = dataset_path / "train_text.txt"
    labels_path = dataset_path / "train_labels.txt"

    texts = _read_lines(text_path)
    raw_labels = _read_lines(labels_path)

    # Fail loudly instead of silently padding the shorter column with NaN.
    if len(texts) != len(raw_labels):
        raise ValueError(
            f"{text_path} has {len(texts)} rows but {labels_path} has {len(raw_labels)}"
        )

    try:
        labels = [int(value) for value in raw_labels]
    except ValueError as exc:
        raise ValueError(f"Non-integer label found in {labels_path}") from exc

    frame = pd.DataFrame(
        {"text": texts, "topic": [topic] * len(texts), "labels": labels}
    )
    # Keep an integer dtype even when the files are empty.
    return frame.astype({"labels": "int64"})

# Few-shot pool: for every topic and every stance label, keep 10 training tweets
# drawn with a fixed seed; a label with fewer than 10 tweets keeps all of them.
random_examples_per_label = {}

for topic in TOPICS:
    train_df = load_train_data(STANCE_DIR / topic, topic)
    examples_by_label = {}
    for label, group in train_df.groupby('labels'):
        if len(group) < 10:
            examples_by_label[label] = group
        else:
            examples_by_label[label] = group.sample(n=10, random_state=1)
    random_examples_per_label[topic] = examples_by_label

random_examples_per_label

## Prompt preparation

In [9]:
def get_query(row, shots, examples=None, mapping=None):
    """Build the few-shot stance prompt for one tweet.

    The prompt has three parts: an introduction naming the topic and the label
    mapping, up to ``shots`` labelled example tweets per stance label, and the
    instruction followed by the tweet to classify.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``'text'`` and
            ``'topic'`` entries.
        shots: Maximum number of example tweets to include per label. A label
            with fewer stored examples contributes all it has.
        examples: ``{topic: {label: DataFrame}}`` where each DataFrame has
            ``text`` and ``labels`` columns. Defaults to the notebook-level
            ``random_examples_per_label``.
        mapping: ``{label: stance name}`` dict; rendered verbatim into the
            prompt and iterated for the label order. Defaults to ``MAPPING``.

    Returns:
        The complete prompt as one string.
    """
    # Resolve the notebook-level defaults at call time so they may be defined
    # in a later cell than this function.
    if examples is None:
        examples = random_examples_per_label
    if mapping is None:
        mapping = MAPPING

    topic = row['topic']
    text = row['text']
    # NOTE(review): TOPIC_MAP (e.g. "feminist" -> "feminism") is defined in this
    # notebook but not applied here, so prompts use the raw topic name —
    # confirm that this is intended.
    topic_examples = examples[topic]

    parts = [
        f"Knowing that the majority of people described the stance from the following tweets about {topic} with one among {mapping}:\n\n"
    ]
    for lab in mapping:
        pool = topic_examples.get(lab)
        if pool is None:
            # No stored examples for this label in this topic.
            continue
        # Slicing (unlike .iloc[i]) tolerates pools smaller than `shots`.
        chosen = pool.iloc[:shots]
        for ex_text, ex_label in zip(chosen['text'], chosen['labels']):
            parts.append(f"tweet: {ex_text} stance: {ex_label}\n\n")

    parts.append(
        f"\nDescribe the stance of the given tweet about {topic}. Choose your answer from {mapping} and return an integer as a result. Do not explain yourself. \n\n"
        f"Tweet: {text}."
    )
    return "".join(parts)

In [10]:
def generate_dataframe(df: pd.DataFrame, shot_counts=range(2, 11, 2)):
    """Add one few-shot prompt column per shot count to ``df``.

    For every ``shots`` value in ``shot_counts`` a column named
    ``prompt{shots}`` is filled with ``get_query(row, shots)`` for each row.

    Args:
        df: Frame whose rows are passed to ``get_query``. It is modified in
            place.
        shot_counts: Iterable of shot counts; the default yields
            2, 4, 6, 8 and 10.

    Returns:
        The same ``df`` object, with the prompt columns added.
    """
    # Snapshot the rows once, before any prompt column exists, so later passes
    # do not drag the previously generated prompts along with every row.
    rows = [row for _, row in df.iterrows()]
    for shots in shot_counts:
        # A plain list keeps the assignment well defined for an empty frame,
        # where DataFrame.apply(axis=1) does not hand back a Series.
        df[f'prompt{shots}'] = [get_query(row, shots) for row in rows]
    return df


In [11]:
# Build every prompt column, then write the result (without the index) as CSV.
new_df = generate_dataframe(df)
prompts_csv = GPT_INPUT_PATH / "23_stance_prompts_few_shots.csv"
new_df.to_csv(prompts_csv, index=False)