# Preparation

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can reach files stored there
drive.mount('/content/drive')

Mounted at /content/drive


# Read Data

In [9]:
import pandas as pd

# Directory that holds the input CSV; later cells write their output files here too.
# The path is relative, so it resolves against the current working directory.
base_data_path = './drive/MyDrive/Studium_Kempten/Masterarbeit/Azure_Notebooks/Abgabe_Data'

# Load the complete review table into a DataFrame
reviews_csv = f'{base_data_path}/amazon_ffr.csv'
data = pd.read_csv(reviews_csv)

In [10]:
# Show (rows, columns) of the freshly loaded table
data.shape

(568454, 10)

# Preprocess Data

In [11]:
# Only Score and Text are needed from here on; every column listed below is removed
columns_to_discard = [
  'Id', 'ProductId', 'UserId', 'ProfileName',
  'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary',
]
data.drop(columns=columns_to_discard, inplace=True)

# Discard every row that still contains a missing value
data.dropna(axis=0, how='any', inplace=True)

# Map Score to Sentiment
def map_score_to_sentiment(score):
  """Translate a 1-5 review score into a three-class sentiment label.

  Args:
    score: Rating of a single review; expected to be 1, 2, 3, 4 or 5.

  Returns:
    0 for a negative review (score 1 or 2), 1 for a neutral review
    (score 3) and 2 for a positive review (score 4 or 5).

  Raises:
    ValueError: If ``score`` is none of the five expected ratings. Without
      this check the function would fall through and return ``None``, which
      silently puts missing labels into the Sentiment column.
  """
  # Plain comparisons instead of `match`, so the function also runs on Python < 3.10
  if score in (1, 2):  # Negative
    return 0
  if score == 3:  # Neutral
    return 1
  if score in (4, 5):  # Positive
    return 2
  raise ValueError(f'Unexpected review score: {score!r}')

# Derive the sentiment label of every review from its score
data['Sentiment'] = data['Score'].map(map_score_to_sentiment)

# The raw score is no longer needed once the label exists
del data['Score']

In [5]:
# Display the preprocessed table (Text and Sentiment columns)
data

Unnamed: 0,Text,Sentiment
0,I have bought several of the Vitality canned d...,2
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,2
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,2
...,...,...
568449,Great for sesame chicken..this is a good if no...,2
568450,I'm disappointed with the flavor. The chocolat...,0
568451,"These stars are small, so you can give 10-15 o...",2
568452,These are the BEST treats for training and rew...,2


# Train-Test-Generalization Split

In [6]:
from sklearn.model_selection import train_test_split
# First split: 30 % of all rows go into a hold-out pool, the rest is the training set
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=11084)

# Second split: draw two disjoint samples of 10,000 rows each from the hold-out pool;
# pool rows that end up in neither sample are left out
test_df, generalization_df = train_test_split(
  temp_df,
  train_size=10000,
  test_size=10000,
  random_state=42,
)

# Write each split to its own CSV file in the data directory (row index omitted)
for split_name, split_df in (
  ('train', train_df),
  ('test', test_df),
  ('generalization', generalization_df),
):
  split_df.to_csv(f'{base_data_path}/amazon_ffr_{split_name}.csv', index=False)

In [7]:
# Report the (rows, columns) of each split, one per line
print(
  f'Train Shape: {train_df.shape}',
  f'Test Shape: {test_df.shape}',
  f'Gerneralization Shape: {generalization_df.shape}',
  sep='\n',
)

Train Shape: (397917, 2)
Test Shape: (10000, 2)
Gerneralization Shape: (10000, 2)
