In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import pickle
import re
import numpy as np
from scipy import stats


In [None]:
def extract_ranking(text):
    """Return the first '#<number>' rank found in a rankings string.

    Commas are removed from the whole string before searching, so
    '#1,234 in Kitchen' yields 1234.

    Returns:
        int rank, or None when the input is missing, is not a string, or
        contains no '#<digits>' token.
    """
    if pd.isna(text) or not isinstance(text, str):
        return None
    # Thousands separators are dropped up front so the captured group is plain digits.
    found = re.search(r'#([\d,]+)', text.replace(',', ''))
    return int(found.group(1)) if found else None

def extract_purchase_count(text):
    """Parse a leading '<digits>[K]+' purchase badge into an integer count.

    Examples: '500+ bought in past month' -> 500, '2K+ bought' -> 2000.
    Surrounding whitespace is ignored, so ' 2K+ bought' also parses; the
    badge must still be the first thing in the text.

    Returns:
        int count, or 0 when the input is missing, is not a string, is
        blank, or does not start with a '<digits>[K]+' badge.
    """
    if pd.isna(text) or not isinstance(text, str):
        return 0
    # Strip before matching: re.match anchors at position 0, so a leading
    # space would otherwise turn a valid badge into a silent 0.
    match = re.match(r'(\d+)([Kk]?)\+', text.strip())
    if not match:
        return 0
    count = int(match.group(1))
    if match.group(2):  # a 'K'/'k' suffix means thousands
        count *= 1000
    return count

def clean_rating(text):
    """Convert a rating string such as '4', '4.5' or '.5' to a float.

    Only digits with at most one decimal place are accepted (surrounding
    whitespace is tolerated).

    Returns:
        float rating, or None when the input is missing, is not a string,
        or does not have that shape (e.g. '4.55', '4.5 out of 5 stars').
    """
    if pd.isna(text) or not isinstance(text, str):
        return None
    looks_like_rating = re.match(r'^(\d+(\.\d)?|\.\d)$', text.strip()) is not None
    # float() tolerates the surrounding whitespace that the pattern check stripped.
    return float(text) if looks_like_rating else None

def clean_review_count(text):
    """Convert a review-count string such as '87' or '1,234' to an int.

    Returns:
        int count, or None when the input is missing, is not a string, or
        cannot be parsed as an integer once commas are removed.
    """
    if pd.isna(text) or not isinstance(text, str):
        return None
    # One guarded conversion path. The previous str.isdigit() fast path was
    # unguarded, and isdigit() is True for characters such as superscript
    # digits that int() rejects, so those inputs raised ValueError.
    try:
        return int(text.replace(',', ''))
    except ValueError:
        return None

# NOTE(review): these are hard-coded absolute paths, while later cells read
# 'trendiness_data_cleaned.csv' and 'amazon_cleaned.csv' by relative path.
# The notebook therefore only runs end-to-end when the working directory is
# this folder; consider a single configurable data directory.
input_file_path = 'C:\\Users\\kongj\\Desktop\\Trendiness Model\\amazon_data.csv'
output_file_path = 'C:\\Users\\kongj\\Desktop\\Trendiness Model\\trendiness_data_cleaned.csv'

df = pd.read_csv(input_file_path)

# Parse the raw text columns into numeric values (None where unparseable).
df['rating'] = df['rating'].apply(clean_rating)
df['review_count'] = df['review_count'].apply(clean_review_count)
df['rankings'] = df['rankings'].apply(extract_ranking)
df['purchase_cnt_prev_month'] = df['purchase_cnt_prev_month'].apply(extract_purchase_count)

# Keep only rows where every model feature could be parsed.
df = df.dropna(subset=['review_count', 'rankings', 'rating', 'purchase_cnt_prev_month'])

# Features-only file used for labelling and training.
df_cleaned = df[['rating', 'review_count', 'rankings', 'purchase_cnt_prev_month']]
df_cleaned.to_csv(output_file_path, index=False)

# Full cleaned table (all original columns) used later for scoring.
# index=False keeps the row index out of the file; otherwise it is read back
# as an 'Unnamed: 0' column and carried into the final output.
df.to_csv('C:\\Users\\kongj\\Desktop\\Trendiness Model\\amazon_cleaned.csv', index=False)

In [None]:

# Load the features-only table produced by the cleaning cell.
data = pd.read_csv('trendiness_data_cleaned.csv')

# Min-max scale the three magnitude features. `scaler` is reused at
# prediction time, so the column order here must match the order used there.
scaled_columns = ['purchase_cnt_prev_month', 'review_count', 'rating']
scaler = MinMaxScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Invert rankings so that a numerically smaller rank gives a larger score;
# the row holding the largest rank value maps to 0.
largest_rank = data['rankings'].max()
data['rankings'] = 1 - (data['rankings'] / largest_rank)

purchases = data['purchase_cnt_prev_month']
rank_score = data['rankings']
rating = data['rating']
reviews = data['review_count']

# Trendiness label: weighted sum of the four normalised features.
data['trendiness'] = 0.35 * purchases + 0.25 * rank_score + 0.2 * rating + 0.2 * reviews

# Uniqueness label: rating minus a weighted sum of the three popularity features.
data['uniqueness'] = 1 * rating - 0.4 * purchases - 0.3 * rank_score - 0.3 * reviews

# The raw uniqueness score can be negative, so rescale it with its own scaler.
uniqueness_scaler = MinMaxScaler()
data['uniqueness'] = uniqueness_scaler.fit_transform(data[['uniqueness']])

data.to_csv('labeled_trendiness_uniqueness_data.csv', index=False)

In [None]:
class TrendinessUniquenessDataset(Dataset):
    """Dataset yielding (features, trendiness_label, uniqueness_label) triples.

    Features are the 'rating', 'review_count', 'rankings' and
    'purchase_cnt_prev_month' columns of the given dataframe, in that order;
    labels come from its 'trendiness' and 'uniqueness' columns.
    """

    def __init__(self, dataframe):
        feature_columns = ['rating', 'review_count', 'rankings', 'purchase_cnt_prev_month']
        self.data = dataframe[feature_columns].values
        self.trendiness_labels = dataframe['trendiness'].values
        self.uniqueness_labels = dataframe['uniqueness'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the idx-th (features, trendiness, uniqueness) triple as float32 tensors."""
        sources = (self.data, self.trendiness_labels, self.uniqueness_labels)
        return tuple(torch.tensor(source[idx], dtype=torch.float32) for source in sources)

In [None]:
class TrendinessModel(nn.Module):
    """Small MLP (4 -> 64 -> 32 -> 1) whose output is passed through a sigmoid."""

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved weights to load.
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3_trendiness = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, 4) feature tensor to a (batch, 1) score tensor."""
        hidden = self.relu(self.fc2(self.relu(self.fc1(x))))
        return self.sigmoid(self.fc3_trendiness(hidden))

In [None]:
class UniquenessModel(nn.Module):
    """Small MLP (4 -> 64 -> 32 -> 1) whose output is passed through a sigmoid."""

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved weights to load.
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3_uniqueness = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, 4) feature tensor to a (batch, 1) score tensor."""
        hidden = self.relu(self.fc2(self.relu(self.fc1(x))))
        return self.sigmoid(self.fc3_uniqueness(hidden))

In [None]:
# Wrap the labelled frame and hold out 20% of the rows for validation.
dataset = TrendinessUniquenessDataset(data)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in train/validation on every run;
# without a generator random_split draws from the global RNG and the
# partition changes each time the cell is executed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Shuffle only the training batches; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Train the trendiness regressor with Adam on mean-squared error.
trendiness_model = TrendinessModel()
optimizer = optim.Adam(trendiness_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_epochs = 10
for epoch in range(num_epochs):
    trendiness_model.train()
    train_loss = 0
    for inputs, trendiness_labels, uniqueness_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        trendiness_outputs = trendiness_model(inputs)
        # squeeze(1) drops only the output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension when the last
        # batch holds a single row, mismatching the (1,) target shape.
        loss_trendiness = criterion(trendiness_outputs.squeeze(1), trendiness_labels)
        loss_trendiness.backward()
        optimizer.step()
        train_loss += loss_trendiness.item()
    train_loss /= len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {train_loss}')

    # Evaluate on the held-out split so over-fitting is visible per epoch.
    trendiness_model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, trendiness_labels, uniqueness_labels in val_loader:
            val_outputs = trendiness_model(inputs)
            val_loss += criterion(val_outputs.squeeze(1), trendiness_labels).item()
    val_loss /= max(len(val_loader), 1)  # guard against an empty validation split
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}')

# Persist only the weights; the loading cells rebuild the architecture first.
with open('trendiness_model.pkl', 'wb') as f:
    pickle.dump(trendiness_model.state_dict(), f)

In [None]:
# Train the uniqueness regressor with Adam on mean-squared error.
uniqueness_model = UniquenessModel()
optimizer = optim.Adam(uniqueness_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_epochs = 10
for epoch in range(num_epochs):
    uniqueness_model.train()
    train_loss = 0
    for inputs, trendiness_labels, uniqueness_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        uniqueness_outputs = uniqueness_model(inputs)
        # squeeze(1) drops only the output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension when the last
        # batch holds a single row, mismatching the (1,) target shape.
        loss_uniqueness = criterion(uniqueness_outputs.squeeze(1), uniqueness_labels)
        loss_uniqueness.backward()
        optimizer.step()
        train_loss += loss_uniqueness.item()
    train_loss /= len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {train_loss}')

    # Evaluate on the held-out split so over-fitting is visible per epoch.
    uniqueness_model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, trendiness_labels, uniqueness_labels in val_loader:
            val_outputs = uniqueness_model(inputs)
            val_loss += criterion(val_outputs.squeeze(1), uniqueness_labels).item()
    val_loss /= max(len(val_loader), 1)  # guard against an empty validation split
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}')

# Persist only the weights; the loading cells rebuild the architecture first.
with open('uniqueness_model.pkl', 'wb') as f:
    pickle.dump(uniqueness_model.state_dict(), f)

In [None]:
def predict_trendiness_uniqueness(new_data, trendiness_model, uniqueness_model, scaler, rankings_max=None):
    """Score rows of raw features with both models.

    Args:
        new_data: DataFrame with 'rating', 'review_count', 'rankings' and
            'purchase_cnt_prev_month' columns holding raw (unscaled) values.
            It is not modified; preprocessing runs on a copy.
        trendiness_model: module mapping a (batch, 4) float tensor to scores.
        uniqueness_model: module mapping a (batch, 4) float tensor to scores.
        scaler: fitted transformer whose ``transform`` is applied to the
            ['purchase_cnt_prev_month', 'review_count', 'rating'] columns,
            in that order.
        rankings_max: denominator used to invert the rankings column
            (``1 - rankings / rankings_max``). Defaults to the largest
            ranking in ``new_data``, which is the previous behaviour. Pass
            the value used when the training labels were built when scoring
            a single row or a small batch; otherwise a lone row is divided
            by itself and its rankings feature is always 0.

    Returns:
        (trendiness_scores, uniqueness_scores): numpy arrays produced by
        ``squeeze()`` on each model output, so a single-row input yields
        0-d arrays.
    """
    scaled_columns = ['purchase_cnt_prev_month', 'review_count', 'rating']
    feature_columns = ['rating', 'review_count', 'rankings', 'purchase_cnt_prev_month']

    # Work on a copy so the caller's frame keeps its raw values.
    features = new_data.copy()
    features[scaled_columns] = scaler.transform(features[scaled_columns])

    if rankings_max is None:
        rankings_max = features['rankings'].max()
    features['rankings'] = 1 - (features['rankings'] / rankings_max)

    inputs = torch.tensor(features[feature_columns].astype('float32').values)

    trendiness_model.eval()
    uniqueness_model.eval()

    with torch.no_grad():
        trendiness_outputs = trendiness_model(inputs)
        uniqueness_outputs = uniqueness_model(inputs)

    trendiness_scores = trendiness_outputs.squeeze().numpy()
    uniqueness_scores = uniqueness_outputs.squeeze().numpy()

    return trendiness_scores, uniqueness_scores

# Rebuild both architectures, then restore the weights saved by the training cells.
loaded_trendiness_model = TrendinessModel()
loaded_uniqueness_model = UniquenessModel()

# NOTE: pickle.load can execute arbitrary code; only load weight files that
# this notebook wrote itself.
for restored_model, weights_file in (
    (loaded_trendiness_model, 'trendiness_model.pkl'),
    (loaded_uniqueness_model, 'uniqueness_model.pkl'),
):
    with open(weights_file, 'rb') as f:
        restored_model.load_state_dict(pickle.load(f))


In [None]:
# `loaded_trendiness_model` / `loaded_uniqueness_model` are restored in the
# previous cell, next to predict_trendiness_uniqueness; they are not reloaded here.

# Full cleaned table (raw feature values plus the original columns).
amazon_df = pd.read_csv('amazon_cleaned.csv')

# Score every row in one batch. Predicting row by row made each row its own
# rankings maximum, so the rankings feature was 0 for every product; scoring
# the whole table at once normalises rankings against the table-wide maximum.
# A copy is passed so the raw feature columns of amazon_df are not overwritten.
trendiness_scores, uniqueness_scores = predict_trendiness_uniqueness(
    amazon_df.copy(), loaded_trendiness_model, loaded_uniqueness_model, scaler
)

# atleast_1d covers a one-row table, where the scores come back as 0-d arrays.
amazon_df['predicted_trendiness'] = np.atleast_1d(trendiness_scores).astype(float)
amazon_df['predicted_uniqueness'] = np.atleast_1d(uniqueness_scores).astype(float)

# Absolute z-scores of the predictions, used to flag outliers.
amazon_df['trendiness_zscore'] = np.abs(stats.zscore(amazon_df['predicted_trendiness']))
amazon_df['uniqueness_zscore'] = np.abs(stats.zscore(amazon_df['predicted_uniqueness']))

# A row is an outlier when either score lies more than `threshold` standard
# deviations from its column mean.
threshold = 3
is_outlier = (amazon_df['trendiness_zscore'] > threshold) | (amazon_df['uniqueness_zscore'] > threshold)
outliers = amazon_df[is_outlier]

# .copy() so the column assignments below write to an independent frame
# instead of a slice of the unfiltered one.
amazon_df = amazon_df[~is_outlier].copy()

# Rescale the surviving predictions with a separate scaler per score.
scaler_trendiness = MinMaxScaler()
scaler_uniqueness = MinMaxScaler()

amazon_df[['predicted_trendiness']] = scaler_trendiness.fit_transform(amazon_df[['predicted_trendiness']])
amazon_df[['predicted_uniqueness']] = scaler_uniqueness.fit_transform(amazon_df[['predicted_uniqueness']])

output_file_path = 'C:\\Users\\kongj\\Desktop\\Trendiness Model\\amazon_predicted_scores.csv'
amazon_df.to_csv(output_file_path, index=False)
print(f"Data without outliers and normalized saved to {output_file_path}")