In [None]:
import pandas as pd
from string import ascii_lowercase

In [3]:
df = pd.read_csv('../generated_data/chat_100.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,type of prompt,temperature,text
0,0,positive,0.1,"Alright, self, let's have a little chat, eh?..."
1,1,positive,0.1,"Man, I can't believe it's been six months al..."
2,2,positive,0.1,"Man, I can't believe it's been six months al..."
3,3,positive,0.1,"Man, I can't believe it's been six months si..."
4,4,positive,0.1,"Alright, self, let's have a little chat, sha..."


In [5]:
df.dtypes

Unnamed: 0          int64
type of prompt     object
temperature       float64
text               object
dtype: object

In [None]:
df['text'].str.islower

0        Alright, self, let's have a little chat, eh?...
1        Man, I can't believe it's been six months al...
2        Man, I can't believe it's been six months al...
3        Man, I can't believe it's been six months si...
4        Alright, self, let's have a little chat, sha...
                             ...                        
595      Sure, I can do this. I mean, I'm a data scie...
596      God. I mean, seriously - when did everything...
597      Man, I don't even know where to begin. Londo...
598      Man, I don't know if I can hack it here anym...
599      Man, I don't know what's wrong with me. I'm ...
Name: text, Length: 600, dtype: object

In [10]:
sum(df['text'].str.startswith('('))

6

In [32]:
import numpy as np

In [34]:
df['text'].str.len().min()

np.int64(1)

In [14]:
non_ascii_mask = df['text'].apply(lambda x: not all(ord(char) < 128 for char in str(x)))

In [38]:
# clean data
def clean_df(df: pd.DataFrame) -> pd.DataFrame:

    # make text lowercase
    df['text_lower'] = df['text'].str.lower()

    # drop if text is shorter than 500 characters
    min_length = df['text_lower'].str.len() >= 500
    # check if text starts with parenthesis
    parenthesis = df['text_lower'].str.startswith('(')
    
    # check if non ascii letters are used
    non_ascii_mask = df['text'].apply(lambda x: not all(ord(char) < 128 for char in str(x)))

    df_cleaned = df[(~parenthesis) & (~non_ascii_mask) & (min_length)]

    return df_cleaned


In [22]:
cleaned_100 = clean_df(df)

In [25]:
def drop_data(df:pd.DataFrame, temp:int = 1.3) -> pd.DataFrame:

    return df[df['temperature'] != temp]

In [27]:
cleaned_medium_temp = drop_data(cleaned_100, temp = 1.3)

In [28]:
cleaned_medium_temp.tail()

Unnamed: 0.1,Unnamed: 0,type of prompt,temperature,text,text_lower
495,495,negative,0.7,"Bloody hell, here I am, a 25-year-old data s...","bloody hell, here i am, a 25-year-old data s..."
496,496,negative,0.7,"London, you're a beast of a city, aren't you...","london, you're a beast of a city, aren't you..."
497,497,negative,0.7,"London, bloody London. A city of millions, a...","london, bloody london. a city of millions, a..."
498,498,negative,0.7,"London, man, it's a beast. I thought I could...","london, man, it's a beast. i thought i could..."
499,499,negative,0.7,"God, I don't even know where to begin. I mea...","god, i don't even know where to begin. i mea..."


In [29]:
cleaned_medium_temp.to_csv('../generated_data/cleaned_medium_temp.csv')

In [36]:
dropped = drop_data(df)

In [40]:
cleaned_final = clean_df(dropped)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_lower'] = df['text'].str.lower()


In [43]:
def prepare_csv(df:pd.DataFrame, col_drop: list = ['Unnamed: 0', 'text_lower']) -> pd.DataFrame:

    # prepare data for export in csv by dropping the columns specified in the argument

    return df.drop(columns=col_drop, axis = 1)

In [45]:
export = prepare_csv(cleaned_final)

In [46]:
export.to_csv('../generated_data/cleaned/final_100.csv')

In [50]:
def merge_df(df1: pd.DataFrame, df2: pd.DataFrame):

    return pd.concat([df1, df2], axis=0)

In [51]:
df_ant = pd.read_csv('../generated_data/data_chat_local.csv')

In [55]:
merged=merge_df(export, df_ant).drop('Unnamed: 0', axis=1)

In [56]:
merged.to_csv('../generated_data/cleaned/merged.csv')

In [58]:
new_observations = pd.read_csv('../generated_data/data_chat_local_complete.csv')

In [61]:
cleaned_new = clean_df(new_observations)

In [62]:
final_df = merge_df(merged, cleaned_new)

In [65]:
prepare_csv(final_df).to_csv('../generated_data/cleaned/final_complete.csv')