### Filter & Change values

In [1]:
import pandas as pd

# Load the cleaned reviews, keep only the three columns used downstream,
# and give them short lowercase names. The keys of `new_columns` double as
# the column selection, so the two lists cannot drift apart.
new_columns = {'Id': 'id', 'Score': 'score', 'Summary': 'title'}
df = (
    pd.read_csv('../datasets/cleaned/Reviews.csv', sep=',')
    .loc[:, list(new_columns)]
    .rename(columns=new_columns)
)
display(df.head(2))

Unnamed: 0,id,score,title
0,1,5,Good Quality Dog Food
1,2,1,Not as Advertised


##### Set values to boolean

In [2]:
# Turn the numeric score into a boolean label.
# Rows scored 2, 3 or 4 are dropped first; of what remains, a score of 5 is
# labelled positive and every other score is labelled negative.
# `.assign` builds the new column on a fresh frame instead of writing into a
# filtered slice, which avoids pandas' SettingWithCopyWarning, and the
# vectorised comparison replaces the per-row lambda.
df = (
    df.loc[~df['score'].isin([2, 3, 4])]
    .assign(is_positive=lambda frame: frame['score'] == 5)
    .loc[:, ['id', 'is_positive', 'title']]
)
display(df.head(2))

Unnamed: 0,id,is_positive,title
0,1,True,Good Quality Dog Food
1,2,False,Not as Advertised


### Data shrink

In [3]:
# Remember the row count before shrinking; the sampling step below uses it
# as the denominator of the down-sampling fraction.
frac_shape = len(df)
print(f"Full length before: {frac_shape}")

Full length before: 415390


In [4]:
# Shrink the data set by down-sampling the positive class.
# The kept fraction of positives is n_negative / n_total, where n_total is
# `frac_shape` (the row count captured before this step). All negative rows
# are kept unchanged.
df_positive = df.query('is_positive == True')
df_negative = df.query('is_positive == False')
df_positive = df_positive.sample(
    frac=df_negative.shape[0] / frac_shape,
    random_state=42,  # fixed seed: a fresh Restart & Run All yields the same subset
)
df = pd.concat([df_positive, df_negative])

In [5]:
# Report the class sizes and the combined size after down-sampling.
size_report = (
    f" Positive length: {len(df_positive)}\n"
    f" Negative length: {len(df_negative)}\n"
    f" Full length: {len(df)}"
)
print(size_report)

 Positive length: 45691
 Negative length: 52268
 Full length: 97959


#### Keep only English titles

In [6]:
import langid

# Drop every row whose title langid does not classify as English.
# The index is reset first so that the positions produced by `enumerate`
# coincide with the index labels passed to `drop`.
df.reset_index(drop=True, inplace=True)

to_delete = [
    position
    for position, title_value in enumerate(df['title'])
    # langid.classify returns a tuple whose first item is the language code
    if langid.classify(title_value)[0] != 'en'
]

df.drop(index=to_delete, inplace=True)

In [7]:
# Row count once the language filter has been applied.
print(f"Full length after: {len(df)}")

Full length after: 81082


#### Save transformed data

In [8]:
# Persist only the label and the title text.
# NOTE(review): the DataFrame index is written as the first, unnamed column
# (to_csv default); confirm whether downstream readers rely on it.
output_path = '../datasets/transformed/Reviews.csv'
df.to_csv(output_path, sep=',', columns=['is_positive', 'title'])