# **Test 080424**

In [24]:
import pandas as pd
import json
import csv
import os
import random

In [25]:

def read_and_sample_data(file_path, sample_size):
    """Randomly sample lines from a file and parse each one into a record.

    Every sampled line is parsed as strict JSON first. A line that is not
    valid JSON falls back to ``ast.literal_eval`` (after mapping the bare
    words ``true``/``false`` to ``True``/``False``) so Python-literal style
    records are still accepted. Lines that cannot be parsed either way are
    skipped.

    Args:
        file_path: Path of a text file holding one record per line.
        sample_size: Maximum number of lines to draw; capped at the number
            of lines in the file.

    Returns:
        A list of parsed records, in the random order chosen by
        ``random.sample``.
    """
    import ast  # local import: only needed for the literal fallback below

    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    sampled_lines = random.sample(lines, min(sample_size, len(lines)))

    sampled_data = []
    for line in sampled_lines:
        try:
            # Strict JSON keeps string values untouched (a blanket
            # "true" -> "True" replace would also rewrite review text).
            record = json.loads(line)
        except ValueError:
            try:
                # literal_eval only accepts literals, so file contents can
                # never execute code the way eval() would allow.
                literal_text = line.replace("true", "True").replace("false", "False")
                record = ast.literal_eval(literal_text.strip())
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue  # unparseable line: skip it, as before
        sampled_data.append(record)

    return sampled_data

def read_txt_file_to_list(file_path):
    """Return each line of a UTF-8 text file with surrounding whitespace stripped.

    Blank lines are kept as empty strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


def remove_duplicate_rows(df):
    """Return ``df`` without rows repeating the same review text, ``asin`` and reviewer.

    The first occurrence of each combination is kept; the input frame is
    not modified.
    """
    identity_columns = ['reviewText', 'asin', 'reviewerID']
    return df.drop_duplicates(subset=identity_columns)

def remove_duplicate_rows_ver2(df):
    """Return ``df`` without rows repeating the same review text, ``itemID`` and reviewer.

    Same as ``remove_duplicate_rows`` but for frames whose item column is
    named ``itemID``. The first occurrence is kept and the input frame is
    not modified.
    """
    identity_columns = ['reviewText', 'itemID', 'reviewerID']
    return df.drop_duplicates(subset=identity_columns)

In [26]:
# Run configuration: dataset identity and sampling size.
file_number = 0
dataset_name = "Toys_and_Games_5"
json_file_path = f"./data/{dataset_name}.json"
output_directory = "data/"
small_dataset = f"Small_{dataset_name}.json"
chunk_size = 10000

# Per-stage CSV outputs, each in its own sub-folder of output_directory.
extracted_data = f"{output_directory}extract/{dataset_name}_extracted_data_{file_number}.csv"
filtered_data = f"{output_directory}filtered/{dataset_name}_filtered_data_{file_number}.csv"
outliner_data = f"{output_directory}outliner/{dataset_name}_outliner_data_{file_number}.csv"

final_json_name = f"{dataset_name}_Filtered.json"


In [None]:
# Draw a random sample of at most chunk_size records from the raw dataset.
sample_size = chunk_size
sampled_data = read_and_sample_data(json_file_path, sample_size)
data_df = pd.DataFrame(sampled_data)
print(data_df)

# Persist the sample as JSON Lines (one record per line).
output_file_path = os.path.join(output_directory, "Small_" + dataset_name + ".json")
data_df.to_json(output_file_path, lines=True, orient='records')

saved_count = len(data_df)
print(f"Saved {saved_count} records to {output_file_path}")


In [None]:
# Reload the sampled reviews and keep only the four columns used downstream.
review_columns = ['reviewerID', 'asin', 'overall', 'reviewText']

data = pd.read_json(output_directory + small_dataset, lines=True)
print(data)
data_df = pd.DataFrame(data, columns=review_columns)
print(data_df.head())
data_df.columns = review_columns

# One plain Python list per column, unpacked in review_columns order.
data_reviewerID, data_itemID, data_rating, data_review = (
    data_df[column].tolist() for column in review_columns
)

In [None]:
from sklearn.model_selection import train_test_split

# Count distinct reviewers and distinct items in the sample
num_users, num_items = (
    len(data_df[column].unique()) for column in ('reviewerID', 'asin')
)

# Display the counts
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")

# Target proportions for the three partitions (train_size itself is not
# passed to train_test_split; it is implied by the other two)
train_size = 0.7
valid_size = 0.1
test_size = 0.2

# Stage 1: carve off the combined validation + test portion
holdout_fraction = valid_size + test_size
train_data, temp_data = train_test_split(
    data_df, test_size=holdout_fraction, random_state=42
)

# Stage 2: divide that hold-out portion between validation and test
test_share_of_holdout = test_size / holdout_fraction
valid_data, test_data = train_test_split(
    temp_data, test_size=test_share_of_holdout, random_state=42
)

# Partition sizes
size_train, size_valid, size_test = len(train_data), len(valid_data), len(test_data)

# Display the sizes
print(f"Size of train set: {size_train}")
print(f"Size of validation set: {size_valid}")
print(f"Size of test set: {size_test}")


In [None]:
# Print the lengths of the rating list and the reviewer-ID list.
for extracted_values in (data_rating, data_reviewerID):
    print(len(extracted_values))

In [None]:
import pandas as pd

def defineRating(data_rating):
    """Map each rating to a binary label: 1 when it is >= 4, otherwise -1.

    Args:
        data_rating: Iterable of numeric ratings.

    Returns:
        A list of 1 / -1 labels in the same order as the input.
    """
    return [1 if rating >= 4 else -1 for rating in data_rating]

# Binary ground-truth labels derived from the star ratings.
T_v = defineRating(data_rating)

# Frame with a running id next to each label.
data = dict(id=range(len(T_v)), T_v=T_v)
df_rating = pd.DataFrame(data)

# Print the dataframe
print(df_rating)


In [None]:

# Attach the review fields to the label frame, column by column.
for column_name, column_values in (
    ('reviewerID', data_reviewerID),
    ('asin', data_itemID),
    ('overall', data_rating),
    ('reviewText', data_review),
):
    df_rating[column_name] = column_values
print(df_rating)

# SODCM

# BERT

In [None]:
import nltk
import spacy
import re
import contractions
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from flair.data import Sentence
from flair.models import TextClassifier
from transformers import pipeline

# Fetch the NLTK resources requested by this notebook.
for resource_name in (
    'averaged_perceptron_tagger',
    'punkt',
    'stopwords',
    'vader_lexicon',
    'punkt_tab',
):
    nltk.download(resource_name)

# Shared model / analyzer instances used by the helper functions below.
tagger = TextClassifier.load('en-sentiment')
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

# Opinion lexicons, one word per line, stored in the output directory.
pos_words, neg_words = (
    read_txt_file_to_list(output_directory + lexicon_file)
    for lexicon_file in ("positive-words.txt", "negative-words.txt")
)
print(pos_words)
print(neg_words)

# Part-of-speech tags (adjective and adverb forms) that mark a sentence as descriptive.
word_type = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
# Words treated as brand names by the sentence splitter and the sentiment filter.
brands = ['dove', 'aveda', 'silver']
# Words that raise the weight of a sentence in sentiment_analysis_flair.
keywords = ['overall', 'summary']

def handle_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

def get_sentiment_words(text):
    """Return the tokens of ``text`` that carry sentiment.

    Contractions are expanded first. A token is kept when it is not a stop
    word and its VADER compound score is non-zero.
    """
    expanded = handle_contractions(text)
    sentiment_words = []
    for token in word_tokenize(expanded):
        if token.lower() in stop_words:
            continue
        if sia.polarity_scores(token)['compound'] != 0:
            sentiment_words.append(token)
    return sentiment_words

def sentiment_analysis_flair(sentence):
    """Return a signed, keyword-weighted sentiment score for one sentence.

    The first label predicted by ``tagger`` supplies the score. It is
    positive when the label value is ``'POSITIVE'`` and negative otherwise,
    and is multiplied by 1.75 when the sentence contains any entry of
    ``keywords`` (case-insensitive). Blank input scores 0.0.
    """
    if not sentence.strip():
        return 0.0

    flair_sentence = Sentence(sentence)
    tagger.predict(flair_sentence)
    top_label = flair_sentence.labels[0]

    # Sentences mentioning a keyword weigh more.
    lowered_sentence = sentence.lower()
    bias = 1.0
    for keyword in keywords:
        if keyword.lower() in lowered_sentence:
            bias = 1.75
            break

    if top_label.value == 'POSITIVE':
        return top_label.score * bias
    return -top_label.score * bias

def check_string(input_value):
    """Return ``input_value`` unchanged when it is a ``str``, else an empty string."""
    if isinstance(input_value, str):
        return input_value
    return ''

def separate_sentences(text):
    """Split review text into clause-level fragments.

    Non-string input is treated as an empty string and contractions are
    expanded before the text is tokenised with the spaCy pipeline ``nlp``.
    A new fragment starts at every sentence start, at a newline token, and
    at every conjunction / relative word, unless the fragment built so far
    mentions one of ``brands``, in which case the conjunction is kept inside
    it. Tokens within a fragment are joined with single spaces.

    A fragment whose last word is a conjunction is not returned on its own:
    it is prepended to the fragment that follows it.

    Args:
        text: Raw review text (any type; non-strings yield an empty list).

    Returns:
        A list of non-empty fragment strings in document order.
    """
    text = check_string(text)
    text = handle_contractions(text)  # Expand contractions before splitting
    doc = nlp(text)

    conjunctions = {"but", "and", "or", "although", "however", "because", "therefore", "meanwhile",
                    "moreover", "nevertheless", "nonetheless", "otherwise", "thus", "yet",
                    "which", "who", "whose", "whom", "where", "when", "why", "that"}
    lowered_brands = [brand.lower() for brand in brands]

    sentences = []
    current_sentence = ""
    for token in doc:
        if token.is_sent_start or token.text == "\n":
            if current_sentence:
                sentences.append(current_sentence.strip())
            current_sentence = token.text
        elif token.text.lower() in conjunctions:
            if any(brand in current_sentence.lower() for brand in lowered_brands):
                # Brand mentions keep their conjunction instead of being split.
                current_sentence += " " + token.text
            elif current_sentence:
                sentences.append(current_sentence.strip())
                current_sentence = token.text
        else:
            current_sentence += " " + token.text

    # Check for the last sentence
    if current_sentence:
        sentences.append(current_sentence.strip())

    # Post-process: carry a dangling conjunction into the following fragment.
    # The test looks at the last *word* (a raw suffix test would also match
    # words such as "floor" or "hand"), and a fragment that has been merged
    # forward is consumed, so it is never emitted a second time.
    processed_sentences = []
    pending = ""
    last_index = len(sentences) - 1
    for index, sentence in enumerate(sentences):
        candidate = f"{pending} {sentence}".strip()
        if not candidate:
            continue
        if index < last_index and candidate.split()[-1].lower() in conjunctions:
            pending = candidate
            continue
        processed_sentences.append(candidate)
        pending = ""

    return processed_sentences

# Transformer-based sentiment analyzer
# NOTE(review): not referenced by any other code visible in this file — confirm it is still needed.
sentiment_analyzer = pipeline("sentiment-analysis")

def split_sentences_and_filter_sentiment(text):
    """Split a review into fragments and keep the opinion-bearing ones.

    The text is split with ``separate_sentences``. A fragment is kept when
    it mentions no entry of ``brands`` and either

    * contains a word with a non-zero VADER compound score or a word listed
      in ``pos_words`` / ``neg_words``, or
    * contains a token whose part-of-speech tag is listed in ``word_type``.

    Args:
        text: Raw review text.

    Returns:
        The list of kept fragments. When no fragment qualifies, the
        unfiltered fragment list is returned instead, so the result is only
        empty when the split itself produced nothing.
    """
    sentences = separate_sentences(text)
    if len(sentences) == 1 and not re.search(r'[.!?]', text):
        # Single fragment without end punctuation: keep the raw text as-is.
        sentences = [text]

    filtered_sentences = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        words = word_tokenize(sentence)
        lowered_words = [word.lower() for word in words]

        # Fragments that mention a brand are never treated as opinionated.
        if any(word in brands for word in lowered_words):
            continue

        contains_sentiment = (
            any(sia.polarity_scores(word)['compound'] != 0 for word in words)
            or any(word in pos_words or word in neg_words for word in lowered_words)
        )

        # POS tags come from the spaCy pipeline. The flair `tagger` is a
        # sentiment classifier and attaches no token-level 'pos' labels, so
        # querying it for tags can never match `word_type`.
        is_a = contains_sentiment or any(
            token.tag_ in word_type for token in nlp(sentence)
        )

        if is_a:
            filtered_sentences.append(sentence)

    if not filtered_sentences:
        filtered_sentences = sentences

    return filtered_sentences



# Processing Data

In [None]:

# Walk one hand-picked review through the pipeline and print every step.
predictRating = []
text = "Lavender water. This has a nice scent and the sprayer sprays well, but it is so weak that the scent disappears within a few minutes If I spray more, the sheets get damp. I received a free bottle in exchange for an honest review and I was pretty excited about having lavender-scented sheets, but I won't be buying any of this item and can't recommend it. I have tried this company's soaps, though, and they are wonderfult"

sentences = split_sentences_and_filter_sentiment(text)
print(sentences)

# Accumulate the per-fragment scores, echoing each one.
total_sentiment = 0
for sentence in sentences:
    score = sentiment_analysis_flair(sentence)
    print(sentence, score)
    total_sentiment = total_sentiment + score
print(total_sentiment)

# The label is decided on the mean score per fragment.
average_sentiment = total_sentiment / len(sentences)
predicted_label = -1 if average_sentiment < -0.2 else 1
predictRating.append(predicted_label)
print(f"Result: {predicted_label}")

In [None]:
import tqdm
import os

predictRating = []
def process_and_save(df_rating, output_file):
    """Predict a sentiment label for every review and append the results to a CSV.

    For each row the review text is split and filtered with
    ``split_sentences_and_filter_sentiment`` and the fragment scores from
    ``sentiment_analysis_flair`` are summed. The label is 1 when the sum is
    at least -0.2 and -1 otherwise. Every label is also appended to the
    module-level ``predictRating`` list.

    Rows are buffered and appended to ``output_file`` in batches of 1000.
    The header is written only when the file does not exist yet, so calling
    this again with the same path appends to the earlier output.

    Args:
        df_rating: Frame with ``reviewerID``, ``asin``, ``T_v``, ``overall``
            and ``reviewText`` columns.
        output_file: Destination CSV path; its parent folder is created if
            missing.
    """
    # Local alias: a later cell rebinds the global name `tqdm` to the class,
    # after which `tqdm.tqdm` would no longer resolve.
    from tqdm import tqdm as progress_bar

    output_columns = ['reviewerID', 'asin', 'predictRating', 'T_v', 'overall',
                      'reviewText', 'filteredReviewText']
    flush_every = 1000
    pending_rows = []

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def flush():
        # Append the buffered rows, then start a fresh buffer.
        batch = pd.DataFrame(pending_rows, columns=output_columns)
        batch.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
        pending_rows.clear()

    rows = progress_bar(df_rating.iterrows(), desc="Processing", total=len(df_rating))
    # Count processed rows explicitly so batching does not depend on the
    # frame carrying a 0..n-1 integer index.
    for processed, (_, row) in enumerate(rows, start=1):
        sentences = split_sentences_and_filter_sentiment(row['reviewText'])
        if not sentences:
            print(f"Warning: No sentences after filtering for review ID {row['reviewerID']} and asin {row['asin']}")

        # NOTE(review): the threshold is applied to the *sum* of fragment
        # scores here, while the single-review demo cell divides by the
        # number of fragments first — confirm which one is intended.
        total_sentiment = sum(sentiment_analysis_flair(sentence) for sentence in sentences)
        score = 1 if total_sentiment >= -0.2 else -1
        predictRating.append(score)

        pending_rows.append({
            'reviewerID': row['reviewerID'],
            'asin': row['asin'],
            'predictRating': score,
            'T_v': row['T_v'],
            'overall': row['overall'],
            'reviewText': row['reviewText'],
            'filteredReviewText': " ".join(sentences),
        })

        if processed % flush_every == 0:
            flush()

    if pending_rows:
        flush()

process_and_save(df_rating, extracted_data)


In [None]:
# Reload the predictions written by process_and_save.
output = pd.read_csv(extracted_data)
output_df = pd.DataFrame(output)
print("Length Original Dataset: ", len(output_df))

# Median rating per item, computed once up front instead of re-scanning the
# whole frame for every mismatching row.
median_by_asin = output_df.groupby('asin')['overall'].median()

outliner_rows = []
filtered_rows = []

for i, row in output_df.iterrows():
    # Default: the adjusted rating equals the original one.
    row['overall_new'] = row['overall']

    if row['predictRating'] != row['T_v']:
        # The predicted label disagrees with the rating-derived label:
        # pull the rating halfway towards the item's median and round it.
        current_asin = row['asin']
        median_value = median_by_asin[current_asin]

        row['overall_new'] = float(round((row['overall'] + median_value) / 2))
        print(row)
        outliner_rows.append(row)
    filtered_rows.append(row)

outliner = pd.DataFrame(outliner_rows)
filteredRecords = pd.DataFrame(filtered_rows)

# Create the destination folders on first use so the writes cannot fail on
# a missing directory.
for csv_path in (outliner_data, filtered_data):
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

outliner.to_csv(outliner_data, index=False, float_format='%.6f')
filteredRecords.to_csv(filtered_data, index=False, float_format='%.6f')


In [None]:
import pandas as pd
from tqdm import tqdm
import string

# Rebuild the English stop-word set and collect the characters of string.punctuation.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
punctuation = set(string.punctuation)

def remove_stopwords_and_punctuation(text):
    """Clean escape artefacts and quote characters out of a review string.

    Despite the name, no stop words are removed. Newlines, ``\\/`` and
    ``\\"`` sequences become single spaces, and backticks and double quotes
    are deleted. ``None`` / NaN input yields an empty string.
    """
    # Missing values (None or NaN) become an empty string.
    if pd.isnull(text):
        return ""

    # Applied in order: escaped quotes must be handled before bare quotes.
    substitutions = (
        ("\n", " "),
        ("\\/", " "),
        ("\\\"", " "),
        ("`", ""),
        ("\"", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Load json data to dataframe
raw_data = pd.read_json(output_directory + small_dataset, lines=True)
print("Dataset: ", len(raw_data))

tqdm.pandas()

# NOTE(review): the cleaned text from remove_stopwords_and_punctuation used to
# be computed here and was then immediately overwritten with None, so that
# dead pass has been dropped. Confirm whether it was meant to be the fallback
# for reviews without a match.

# A review is identified by (reviewerID, asin, reviewText), compared as
# strings. Tuple keys avoid the ambiguity of concatenating the three fields
# without a separator.
key_columns = ['reviewerID', 'asin', 'reviewText']


def _record_keys(frame):
    """Yield one (reviewerID, asin, reviewText) string tuple per row of ``frame``."""
    return zip(*(frame[column].astype(str) for column in key_columns))


# Index the processed records once; the first record per key wins, which
# mirrors taking the first matching row.
filtered_lookup = {}
for record_key, filtered_text, new_rating in zip(
    _record_keys(filteredRecords),
    filteredRecords['filteredReviewText'],
    filteredRecords['overall_new'],
):
    filtered_lookup.setdefault(record_key, (filtered_text, new_rating))

# Defaults for reviews that have no processed counterpart.
raw_data['filteredReviewText'] = None
raw_data['overall_new'] = raw_data['overall']

for index, record_key in tqdm(zip(raw_data.index, _record_keys(raw_data)), total=len(raw_data)):
    match = filtered_lookup.get(record_key)
    if match is not None:
        raw_data.at[index, 'filteredReviewText'] = match[0]
        raw_data.at[index, 'overall_new'] = match[1]

# Overwrites the sampled dataset file with the enriched records, then reads
# it back to report the final size.
raw_data.to_json(output_directory + small_dataset, orient='records', lines=True)
data_filtered = pd.read_json(output_directory + small_dataset, lines=True)

print("Final dataset length: ", len(data_filtered))


In [None]:
import os
import glob
import shutil

def backup_and_delete_files(folder_path, backup_path, backup_folder_name, date, extensions=(".csv",)):
    """Copy matching files into a dated backup folder, then delete the originals.

    The backup folder is ``<backup_path>/<backup_folder_name>_<date>`` and is
    created when missing. A file is only deleted after its copy succeeded.
    Failures are reported per file and do not stop the remaining files.

    Args:
        folder_path: Folder whose direct children are scanned.
        backup_path: Parent folder of the backup folder.
        backup_folder_name: Name prefix of the backup folder.
        date: Suffix appended to the backup folder name.
        extensions: File-name endings to match, each used as the glob
            pattern ``*<ext>``. A single string is accepted as well.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = (extensions,)

    backup_folder_path = os.path.join(backup_path, backup_folder_name + "_" + date)
    print(f"Thư mục sao lưu: {backup_folder_path}")

    if not os.path.exists(backup_folder_path):
        print(f"Tạo thư mục sao lưu mới: {backup_folder_path}")
        # exist_ok tolerates the folder appearing between the check and the call.
        os.makedirs(backup_folder_path, exist_ok=True)

    for ext in extensions:
        for file_path in glob.glob(os.path.join(folder_path, f"*{ext}")):
            try:
                shutil.copy(file_path, backup_folder_path)
                print(f"Đã sao chép: {file_path} tới {backup_folder_path}")
                os.remove(file_path)
                print(f"Đã xóa: {file_path}")
            except OSError as e:
                # Best effort: report the failure and continue with the next file.
                print(f"Không thể sao chép hoặc xóa {file_path}. Lỗi: {e}")

# Example usage: move the generated artefacts of each working folder into a
# dated backup folder under "backup". The originals are deleted after copying.
backup_and_delete_files("feature", "backup", "BKfeature", "170824", extensions=[".csv"])
backup_and_delete_files("feature_originalmethod", "backup", "BKfeature_originalmethod", "170824", extensions=[".csv"])
backup_and_delete_files("data", "backup", "BKdata", "170824", extensions=[".csv"])
# ".npz" carries its leading dot like the other extensions, so only real
# .npz files are matched.
backup_and_delete_files("chkpt", "backup", "BK_chkpt", "170824", extensions=[".pt", ".pkl", ".npz"])
backup_and_delete_files("output", "backup", "BK_output", "170824", extensions=[".model"])
