# Data Preparation

Use the full dataset

In [1]:
import collections
import numpy as np
import pandas as pd
import re
import emoji
import string 
from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

from argparse import Namespace

In [2]:
args = Namespace(                               # Create key, value pairs, and access values later on, e.g., args.seed
    full_dataset = "file.csv",
    raw_train_dataset_csv="train.csv",
    raw_test_dataset_csv="test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv="good_bad_with_splits_full.csv",
    seed=1337
)

In [3]:
# Split dataset into train and test
df = pd.read_csv(args.full_dataset, names=['review','labels'])
df = df.tail(-1)
df = df.drop(df[df['labels'].isin(['neutral'])].index)
df = df.reset_index(drop=True)
df_good = pd.DataFrame(columns = ['review','labels'])
df_bad = pd.DataFrame(columns = ['review','labels'])
df = df.reset_index(drop=True)
for i in range(len(df)):
    if df['labels'][i] == 'good':  
        df_good = pd.concat([df_good, df.iloc[i:i+1]])    
    else:
        df_bad = pd.concat([df_bad, df.iloc[i:i+1]])  
bad = df_bad.sample(20000, random_state=42)
good= df_good.sample(20000, random_state=42)
df = pd.DataFrame(columns = ['review','labels'])
df = pd.concat([good,bad])
print(df)
# for i in range(len(df)):
#     if df['labels'][i] =='good':
#         print(df['labels'][i])
#         df = df.sample(n=20000)
#     else:
#         df = df.sample(n=20000)
# print(df)
df['split'] = np.random.randn(df.shape[0], 1)
msk = np.random.rand(len(df)) <= 0.7
train = df[msk]
test = df[~msk]

# Save train and test dataset to .csv
train.drop('split', axis=1).to_csv(args.raw_train_dataset_csv, index=False, header=False)
test.drop('split', axis=1).to_csv(args.raw_test_dataset_csv, index=False, header=False)

# Read raw data
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['review','labels'])   # no header in the file
train_reviews = train_reviews[~pd.isnull(train_reviews.review)]                                    # remove empty reviews
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['review','labels'])
test_reviews = test_reviews[~pd.isnull(test_reviews.review)]
# df

                                                   review labels
33119   ChatGPT use case - breaking through writers or...   good
46705   Sod it I'll put it here too, it's just that Go...   good
98240   ChatGPT is at capacity right now \nWrite a lim...   good
2185    #ChatGPT for cooking inspiration https://t.co/...   good
89625   Teachers friends, if you want to see how much ...   good
...                                                   ...    ...
14862   ChatGPT doesn't only pass Turing's test, but w...    bad
114962  (1/2) ChatGPT is an incredible tool, but when ...    bad
33036   Explained: What is ChatGPT, how it works and c...    bad
137603          Google vs ChatGPT https://t.co/jSvjwOIjX8    bad
103832  Following an example I reposted on LinkedIn to...    bad

[40000 rows x 2 columns]


In [4]:
train_reviews.head()

Unnamed: 0,review,labels
0,ChatGPT use case - breaking through writers or...,good
1,"Sod it I'll put it here too, it's just that Go...",good
2,ChatGPT is at capacity right now \nWrite a lim...,good
3,"Teachers friends, if you want to see how much ...",good
4,Remove the last sentence and I'm pretty happy ...,good


In [5]:
test_reviews.head()

Unnamed: 0,review,labels
0,#ChatGPT for cooking inspiration https://t.co/...,good
1,#OpinionampAnalysis #Education #Students Opini...,good
2,"A simple example, but still impressive. https:...",good
3,#ChatGPT is mind blowing 🤯 #OpenAI wonder it’s...,good
4,I asked #ChatGPT to write code that transpiles...,good


In [6]:
# Unique classes
set(train_reviews.labels)


{'bad', 'good'}

In [7]:
# Splitting train by rating
# Create dict
by_rating = collections.defaultdict(list)  

for _, row in train_reviews.iterrows():
    by_rating[row.labels].append(row.to_dict())

# Output will be like below:
#  {
#   1 : [{'rating': 1, 'review': 'Just ...'}, {'rating': 1, 'review': 'But ...'}], 
#   2 : [{'rating': 2, 'review': 'How ...'}, {'rating': 2, 'review': 'Here ...'}]
#  }

In [8]:
by_rating

defaultdict(list,
            {'good': [{'review': 'ChatGPT use case - breaking through writers or creative blocks. \\n\\nI can see this being useful to kick around ideas and form new thought patterns to your own original ideas. \\n\\nSo AI that augments our thought process to a degree, not replacing it...',
               'labels': 'good'},
              {'review': "Sod it I'll put it here too, it's just that Good! I might make this little app public if there's interest. Thank you @OpenAI for the awesome tech!\\n\\n#OpenAI #ChatGPT #gpt3chat #chatgpt3 #OpenAIChat #OpenAIChatGPT https://t.co/sz9wM4nVqO",
               'labels': 'good'},
              {'review': "ChatGPT is at capacity right now \\nWrite a limerick about the status of ChatGPT\\nChatGPT is surely the best\\nBut its servers are put to the test\\nWith so many users chatting\\nIt's no wonder they're lagging\\nBut they'll fix it soon, no need to fret!\\n#aweSTEM @realJ_Mitchell @ZackTeitel",
               'labels': 'good'}

In [9]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    # Add to final list
    final_list.extend(item_list) # use extend when adding multiple elements to a list; use append for additing one element.

In [10]:
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [11]:
# Write split data to file
final_reviews = pd.DataFrame(final_list)

In [12]:
final_reviews.split.value_counts()

train    19632
test     11953
val       8413
Name: split, dtype: int64

In [13]:
final_reviews.review.head()

0    So... can #ChatGPT listen to a Twitter space a...
1    A poem about @elonmusk by ChatGPT https://t.co...
2    Asking ChatGpt about the future of cryptocurre...
3    ChatGPT: OpenAI’s New Chatbot Takes the Intern...
4                                     #ChatGPT unreal.
Name: review, dtype: object

In [14]:
final_reviews[pd.isnull(final_reviews.review)]

Unnamed: 0,review,labels,split


In [15]:
# Preprocess the reviews
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
#     text = re.sub(r"([.,!?])", r" \1 ", text)  # E.g., convert "end." to "end . "
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) # replace special characters with empty string
    text = re.sub(r'[^\w\s]+', '', text)
    text = emoji.demojize(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [16]:
# final_reviews['rating'] = final_reviews.rating.apply({0: 'prochoice', 1: 'prolife'}.get)

In [17]:
final_reviews.head()

Unnamed: 0,review,labels,split
0,so can chatgpt listen to a twitter space an...,bad,train
1,a poem about elonmusk by chatgpt t co yzw g he,bad,train
2,asking chatgpt about the future of cryptocurre...,bad,train
3,chatgpt openai s new chatbot takes the interne...,bad,train
4,chatgpt unreal,bad,train


In [18]:
final_reviews.to_csv(args.output_munged_csv, index=False)  # don't write row names (index).

In [19]:
final_reviews

Unnamed: 0,review,labels,split
0,so can chatgpt listen to a twitter space an...,bad,train
1,a poem about elonmusk by chatgpt t co yzw g he,bad,train
2,asking chatgpt about the future of cryptocurre...,bad,train
3,chatgpt openai s new chatbot takes the interne...,bad,train
4,chatgpt unreal,bad,train
...,...,...,...
39995,huge alpha leak ape in when chatgpt launches t...,bad,test
39996,i asked chatgpt to write a poem about molecula...,bad,test
39997,on openai chatgpt i got an answer in seconds ...,bad,test
39998,chatgpt is an incredible tool but when asked...,bad,test
