# WhatsLLama Dataset
This notebook converts messages exported from WhatsApp into a dataset that can be used to train a LLaMA model.

In [None]:
# reboot required because of incompatible numpy versions on colab
!pip install whatstk

In [None]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import os
import random
from whatstk import WhatsAppChat

# Config

In [None]:
# Directory with the WhatsApp chat exports; it should contain only the .txt
# files exported from WhatsApp.
path = "## UPDATE PATH HERE ##"

# Chat participant whose messages become the training targets.
username = "## UPDATE USERNAME HERE ##"

# Persona description; it is appended after "You are <username>, " when the
# system prompt is built.
sys_prompt = "a student from zurich and you speak swiss german. Always answer in swiss german based on the conversation you have seen"

# Data preparation
If you are running in Google Colab, execute the cell below to mount Google Drive; otherwise skip it.

In [None]:
# Mount Google Drive when the Colab helper package is importable. When it is
# not (i.e. the notebook is not running on Colab) the cell does nothing, so
# "Run All" does not stop here.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
# Collect the exported chats (.txt files only). Sorting makes the row order of
# the combined frame independent of the order os.listdir happens to return.
file_list = sorted(f for f in os.listdir(path) if f.endswith('.txt'))
print(file_list)

if not file_list:
    raise FileNotFoundError(f"No WhatsApp .txt exports found in {path!r}")

# Parse every export and concatenate once at the end; calling pd.concat inside
# the loop would copy all previously loaded rows again on every iteration.
chat_frames = []
for file_name in file_list:
    filepath = os.path.join(path, file_name)
    chat = WhatsAppChat.from_source(filepath=filepath)
    # The file name without its extension identifies the conversation.
    conv_id = os.path.splitext(file_name)[0]
    chat_frames.append(chat.df.assign(**{'conv-id': conv_id}))

df = pd.concat(chat_frames, ignore_index=True)

In [None]:
# Get rid of WhatsApp-generated messages. The markers are the English notices,
# so this list depends on the app language of the export.
system_message_markers = [
    'omitted',
    'Your security code with',
    'changed this group',
    'end-to-end encrypted',
    'added you',
    'created group',
    # no trailing period, so the notice matches with or without one
    'This message was deleted',
    'You deleted this message',
    'Waiting for this message',
]

# Build a single boolean mask instead of re-filtering the frame per marker.
# regex=False: the markers are literal text, not regular expressions.
is_system_message = np.zeros(len(df), dtype=bool)
for marker in system_message_markers:
    is_system_message |= (
        df['message']
        .str.contains(marker, case=False, na=False, regex=False)
        .to_numpy(dtype=bool)
    )

df = df[~is_system_message]

In [None]:
# Keep an independent snapshot of the cleaned raw messages for the statistics
# section. A plain alias would share the object with df and pick up the
# in-place column assignment made on df later in the notebook.
raw_whatsapp_df = df.copy()
df.describe()

## Merge subsequent messages
In instant messaging, several messages about the same topic are often sent individually. Consecutive messages are concatenated when they belong to the same conversation, come from the same sender, and follow each other within two minutes.

In [None]:
# Coerce the timestamp column to datetimes; the time gaps between messages are
# computed from it with .diff().
df['date'] = df['date'].pipe(pd.to_datetime)
def assign_group_id(row):
    """Return the row's index label if it starts a new message group, else NaN.

    A row starts a new group when it has no time difference to a previous
    message (missing 'time_diff'), when more than two minutes have passed
    since the previous message, or when 'user_change' is truthy.
    """
    gap = row['time_diff']
    starts_new_group = pd.isna(gap) or gap > pd.Timedelta(minutes=2) or row['user_change']
    return row.name if starts_new_group else np.nan


def process_conversation(sub_df):
    """Annotate the messages of one conversation with a message-group id.

    Adds three columns to a copy of ``sub_df`` and returns that copy:

    - ``time_diff``: time elapsed since the previous row's ``date``
    - ``user_change``: True where ``username`` differs from the previous row
      (the first row is compared with itself, so it is False there)
    - ``group_id``: index label of the row that opened the current group, as
      decided by ``assign_group_id`` and forward-filled over follow-up rows

    Rows are expected in chronological order; ``sub_df`` must not be empty.
    """
    # Work on a copy so the frame handed in by the caller stays untouched.
    sub_df = sub_df.copy()
    sub_df['time_diff'] = sub_df['date'].diff()
    first_user = sub_df['username'].iloc[0]
    sub_df['user_change'] = sub_df['username'] != sub_df['username'].shift(fill_value=first_user)
    # Assign the forward-filled result back to the column. An in-place fillna
    # on the selected column is not guaranteed to write back into the frame.
    sub_df['group_id'] = sub_df.apply(assign_group_id, axis=1).ffill()
    return sub_df


# Annotate every conversation separately. Iterating over the groups (instead
# of GroupBy.apply) hands process_conversation the full sub-frame including
# the 'conv-id' column, which the aggregation below groups on. GroupBy.apply
# operating on the grouping column is deprecated in recent pandas.
processed_conversations = [
    process_conversation(conversation) for _, conversation in df.groupby('conv-id')
]
df = pd.concat(processed_conversations, ignore_index=True)

# Collapse every message group into one row: keep the first timestamp and
# sender, and join the individual messages with newlines.
grouped_conv_final = df.groupby(['group_id', 'conv-id']).agg(
    date=('date', 'first'),
    username=('username', 'first'),
    message=('message', lambda msgs: '\n'.join(msgs)),
    conv=('conv-id', 'first'),
).reset_index(drop=True)
df = grouped_conv_final

In [None]:
# Preview the last 100 merged messages.
grouped_conv_final.iloc[-100:]

## Convert to Alpaca format



In [None]:
# Instruction text shared by every training example.
instruction = f"Continue this conversation as {username}"
# Messages written by the configured user.
df_user = df.loc[df["username"].eq(username)]

# For each message of the user, a random number of preceding messages becomes the input
def create_input_string(row):
    """Build the context string for one message of the user.

    Draws a random number between 1 and min(10, row["preceding_count"]) and
    takes that many of the latest rows of the global ``df`` that belong to the
    same conversation and have a strictly earlier date. Each one is rendered
    as "<username>: <message>" and the lines are joined with newlines.
    """
    context_size = random.randint(1, min(10, row["preceding_count"]))

    same_conversation = df["conv"] == row["conv"]
    sent_earlier = df["date"] < row["date"]
    context = df.loc[same_conversation & sent_earlier, ["username", "message"]].tail(context_size)

    lines = []
    for user, msg in zip(context["username"], context["message"]):
        lines.append(f"{user}: {msg}")
    return "\n".join(lines)


# Seed the generator used by create_input_string so that re-running the
# notebook produces the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Number of messages that precede each message within its conversation.
# rank(method="min") is 1 + the number of strictly earlier timestamps in the
# same conversation, so subtracting 1 gives the same count as comparing every
# row against the whole frame, without the quadratic scan.
preceding_per_message = (
    df.groupby("conv")["date"].rank(method="min").sub(1).fillna(0).astype(int)
)
df_user = df_user.assign(preceding_count=preceding_per_message.loc[df_user.index])

# Messages without any context cannot form a training example. The .copy()
# makes the column assignments below act on an independent frame instead of a
# filtered view.
df_user = df_user[df_user["preceding_count"] > 0].copy()
df_user["input"] = df_user.apply(create_input_string, axis=1)
df_user["output"] = f"{username}: " + df_user["message"]

df_alpaca = df_user.assign(instruction=instruction)[["instruction", "input", "output"]]


In [None]:
# Write the instruction / input / output columns to disk without the row index.
df_alpaca.to_csv(path_or_buf="whatsapp-alpaca.csv", index=False)

# Convert the dataset from the Alpaca format into the LLaMA 2 prompt template
As described here
\<s> [INST] <<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Please answer in the same language as the user.
<<\/SYS>>

This is a test question[/INST] This is an answer \</s>


In [None]:
# Reload the Alpaca-style rows (instruction, input, output) written earlier.
df = pd.read_csv(filepath_or_buffer='whatsapp-alpaca.csv')


def format_data(row):
    """Render one Alpaca row as a single-line LLaMA 2 prompt string.

    Reads 'instruction', 'input' and 'output' from ``row``. Line breaks in
    input and output are replaced by spaces. The system prompt is built from
    the global ``username`` and ``sys_prompt``.
    """
    def flatten(text):
        # Line breaks become spaces. The upside-down exclamation mark does not
        # make sense in a name, so 'ol¡' is rewritten to 'oli'.
        for old, new in (('\n', ' '), ('\r', ' '), ('ol¡', 'oli')):
            text = text.replace(old, new)
        return text

    inst = row['instruction']
    inp = flatten(row['input'])
    outp = flatten(row['output'])
    return f"<s>[INST]<<SYS>>You are {username}, {sys_prompt}<</SYS>> {inst} {inp} [/INST] {outp}</s>"


df['text'] = df.apply(format_data, axis=1)

# Filter out nonsense messages

# Overly long prompts are often not useful (copy-pasted text etc.): keep only
# prompts of at most 10000 characters.
df = df.loc[df['text'].str.len().le(10000)]


def check_word_length(text, max_length):
    """Return True if no whitespace-separated word of ``text`` is longer than ``max_length``."""
    for word in text.split():
        if len(word) > max_length:
            return False
    return True


# Drop prompts containing a word of more than 20 characters -> in my messages
# these are mostly not useful.
has_only_short_words = df['text'].map(lambda prompt: check_word_length(prompt, 20))
df = df.loc[has_only_short_words]

df['text'].to_csv('train-llama-prompts.csv', index=False, header=True)

# Convert to other Alpaca Format
This is used to fine-tune the model: https://huggingface.co/flozi00/Llama-2-7b-german-assistant-v3 as it was trained on a slightly different format

"### Assistant:" "### User:"

In [None]:
import re

# The file was written with one header line ('text') followed by the prompts.
# header=0 uses that line as the column name. header=1 would treat the first
# prompt as the header and silently drop it from the data.
data = pd.read_csv('train-llama-prompts.csv', header=0)

def process_row(row):
    """Convert one LLaMA 2 prompt into the "### User: / ### Assistant:" format.

    The prompt is read from the first column of ``row``. Returns the converted
    string, or None when the cell is not a string or does not start with the
    ``<s>[INST]...[/INST]...</s>`` pattern.
    """
    # .iloc[0] reads the first column by position whatever its label is.
    # Plain row[0] depends on a deprecated positional fallback for
    # label-indexed Series.
    prompt = row.iloc[0]
    if not isinstance(prompt, str):
        # e.g. NaN read from an empty CSV cell
        return None

    # Extract the text between [INST] and [/INST], and the text after [/INST]
    match = re.match(r'<s>\[INST\](.*?)\[/INST\](.*?)</s>', prompt)
    if match is None:
        return None

    instruction_text, response_text = match.groups()
    return f"### User: {instruction_text}\n</s>### Assistant: {response_text} </s>"


# Prefix (system prompt plus instruction) that every prompt starts with. It is
# derived from the configured username and sys_prompt rather than hard-coded,
# so it also matches when the notebook is configured for a different user.
text_to_remove = (
    f"<<SYS>>You are {username}, {sys_prompt}<</SYS>> "
    f"Continue this conversation as {username} "
)

data['text'] = data.apply(process_row, axis=1)
# regex=False: the prefix is literal text and contains characters such as '.'
# that would otherwise act as regex metacharacters.
data['text'] = data['text'].str.replace(text_to_remove, "", regex=False)
data['text'] = data['text'].str.replace('\n', "", regex=False)

In [None]:
# Write the converted prompts as a one-column CSV with header and without index.
data['text'].to_csv(path_or_buf='train-other-alpaca-prompts.csv', index=False, header=True)

# Visualization & Key numbers
Section to create visualizations and get some key figures about the generated dataset

In [None]:
import matplotlib.pyplot as plt
import nltk

from wordcloud import WordCloud
from collections import Counter
from collections import defaultdict
from nltk.corpus import stopwords
from nltk import word_tokenize
# NLTK resources: the stop-word lists and the tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): newer NLTK releases load the tokenizer models from
# 'punkt_tab' instead of 'punkt' - confirm against the installed version.
nltk.download('punkt_tab')

# Key Figures

In [None]:
print(f'Size of the dataset: {len(df["text"])}')
print(f'Size of raw msg:{len(raw_whatsapp_df)}')
print(f'Num of conversations:{len(raw_whatsapp_df["conv-id"].unique())}')

# Messages inside an input are separated by "\n", so an input holds
# (newlines + 1) messages. The average is taken over the inputs themselves
# (df_alpaca), not over the later, filtered prompt frame.
if len(df_alpaca) > 0:
    msgs_per_input = sum(x.count("\n") + 1 for x in df_alpaca["input"]) / len(df_alpaca)
else:
    msgs_per_input = 0.0
print("Number of msg per input", msgs_per_input)
raw_whatsapp_df.describe()

Information about the messages the user has written

In [None]:
# Raw messages written by the configured user. The .copy() gives the cells
# below, which add columns to this frame, an independent object instead of a
# filtered view of raw_whatsapp_df.
user = raw_whatsapp_df[raw_whatsapp_df['username'] == username].copy()

## Words per message

In [None]:
# Number of whitespace-separated words in each message of the user.
lengths = user['message'].str.split().map(len)

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(lengths, bins=range(0, 20), alpha=0.7, edgecolor='black', color='blue')

ax.set_xlim((1, 20))
ax.set_xlabel('Number of Words')
# Ticks sit at the bin centres (n + 0.5) and are labelled with n.
ax.set_xticks([x + 0.5 for x in range(0, 20)])
ax.set_xticklabels(range(0, 20))
ax.set_ylabel('Frequency')
ax.set_title('Words per Message')

plt.show()

## Length of Words

In [None]:
# Character length of every whitespace-separated word, per message.
user['Word Lengths'] = user['message'].map(lambda msg: [len(word) for word in str(msg).split()])

fig, ax = plt.subplots(figsize=(12, 8))

# Flatten the per-message lists into one list for the histogram.
flat_data = []
for word_lengths in user['Word Lengths']:
    flat_data.extend(word_lengths)
ax.hist(flat_data, bins=range(0, 20), alpha=0.7, edgecolor='black', color='blue')

ax.set_xlim((1, 20))
ax.set_xlabel('Length of Words')
# Ticks sit at the bin centres (n + 0.5) and are labelled with n.
ax.set_xticks([x + 0.5 for x in range(0, 20)])
ax.set_xticklabels(range(0, 20))
ax.set_ylabel('Frequency')
ax.set_title('Length of Words')

plt.show()

In [None]:
# Count words and prepare stop words
stop = set(stopwords.words('german'))

# Lower-case and tokenize every message, then keep purely alphabetic tokens.
user['tokenized_message'] = user['message'].map(lambda text: word_tokenize(text.lower()))
user['filtered_message'] = user['tokenized_message'].map(
    lambda tokens: [token for token in tokens if token.isalpha()]
)

# One flat list with all remaining tokens of all messages.
corpus = []
for message_tokens in user['filtered_message']:
    corpus.extend(message_tokens)

counter = Counter(corpus)
most = counter.most_common()

## Most frequent words

In [None]:
# The 40 most frequent words and their counts.
top_words = most[:40]
x = [word for word, _ in top_words]
y = [count for _, count in top_words]

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(x, y, color='blue')

ax.set_xlabel('Words')
ax.set_ylabel('Counts')
ax.set_title('Most Frequent Words')

plt.xticks(rotation=45)

plt.show()

## Most Frequent Words without Stop Words

In [None]:
# Of the 40 most frequent words, keep those that are not German stop words.
kept_words = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in kept_words]
y = [count for _, count in kept_words]

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(x, y, color='blue')

ax.set_xlabel('Words')
ax.set_ylabel('Counts')
ax.set_title('Most Frequent Words without German Stop Words')

plt.xticks(rotation=45)

plt.show()

## Most Frequent Stop Words

In [None]:
from nltk.corpus import stopwords

# How often each German stop word occurs in the corpus.
dic = Counter(word for word in corpus if word in stop)

# The ten most frequent stop words, highest count first.
top = dic.most_common(10)

if top:
    x, y = zip(*top)
    plt.figure(figsize=(12, 8))
    plt.bar(x, y, color='blue')
    plt.xlabel('German Stop Words')
    plt.ylabel('Counts')
    plt.title('Most Frequent Stop Words')

    plt.xticks(rotation=45)

    plt.show()
else:
    # zip(*top) cannot be unpacked into x and y when top is empty.
    print('No German stop words found in the corpus - nothing to plot.')

## Generate Word Cloud

In [None]:
# Word cloud input: all corpus tokens that are not stop words, space-separated.
non_stop_tokens = [txt for txt in corpus if txt not in stop]
text = " ".join(non_stop_tokens)
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Show the generated image without axes; dpi is raised for a higher resolution.
plt.figure(figsize=(10, 5), dpi=100)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()