In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
import csv

# Scrape the Technodom smartphone catalog: walk the paginated listing, open every
# product page, collect its review texts, and export one CSV row per product.
base_url = "https://www.technodom.kz"
start_path = "/catalog/smartfony-i-gadzhety/smartfony-i-telefony/smartfony"
max_pages = 50  # You can change this to a higher or lower number
request_timeout = 15  # seconds; without a timeout requests can wait on a stalled server forever
request_delay = 1  # seconds to pause after each product page, whether it succeeded or not

# The site's CSS class names end in a build-specific hash, so every selector
# matches on the stable prefix instead of one exact hashed name.
product_link_class = re.compile(r'ProductItem_itemLink.*')
review_block_class = re.compile(r'Review_block.*')
review_text_class = re.compile(r'^Review_textSummary__')

all_products = []

for page_number in range(1, max_pages + 1):
    current_url = f"{base_url}{start_path}?page={page_number}"
    print(f"\nüîÑ Parsing page {page_number}: {current_url}")

    try:
        response = requests.get(current_url, timeout=request_timeout)
    except requests.RequestException as e:
        # Stop paging but still fall through to the CSV export below, so the
        # products scraped so far are not lost to a single network failure.
        print(f"‚ö†Ô∏è Error while requesting the page: {e}")
        break

    if response.status_code != 200:
        print(f"‚ö†Ô∏è Error {response.status_code} while requesting the page.")
        break

    soup = BeautifulSoup(response.text, "html.parser")

    products = soup.find_all('a', class_=product_link_class)
    if not products:
        # An empty listing is treated as the end of the catalog.
        print("‚ùå No products found, stopping.")
        break

    for product in products:
        product_title_tag = product.find('p', {'data-testid': 'product-title'})
        title = product_title_tag.text.strip() if product_title_tag else "Title not found"

        link = product.get('href')
        if not link:
            # Without an href there is no product page to fetch; skip instead of
            # requesting a bogus "<base_url>None" address.
            print(f"‚ö†Ô∏è Link not found for product: {title}")
            continue
        full_link = f"{base_url}{link}"

        print(f"\nüì± Product: {title}")
        print(f"üîó Link: {full_link}")

        product_reviews = []

        try:
            product_page_response = requests.get(full_link, timeout=request_timeout)
            if product_page_response.status_code != 200:
                # Handled like any other fetch failure: the product is still
                # recorded (with no reviews) and the throttle below still runs.
                print(f"‚ö†Ô∏è Error {product_page_response.status_code} while loading product page.")
            else:
                product_soup = BeautifulSoup(product_page_response.text, "html.parser")
                reviews = product_soup.find_all('div', class_=review_block_class)

                if reviews:
                    print("üí¨ Reviews:")
                    for review in reviews:
                        review_comment = review.find('span', class_=review_text_class)
                        review_text = review_comment.text.strip() if review_comment else ""
                        print(f" - Review: {review_text or 'No review text.'}")
                        # Store only real text; a placeholder would otherwise be
                        # exported and later processed as if it were a review.
                        if review_text:
                            product_reviews.append(review_text)
                else:
                    print("‚ùå No reviews found.")

        except Exception as e:
            # Best effort: one broken product page must not abort the whole crawl.
            print(f"‚ö†Ô∏è Error while processing the product: {e}")

        all_products.append({
            "Title": title,
            "Link": full_link,
            "Reviews": " ||| ".join(product_reviews) if product_reviews else "No reviews"
        })

        print("-" * 60)
        # Throttle after every product request, including failed ones, so an
        # erroring server is not hit in a tight loop.
        time.sleep(request_delay)

print(f"\nüíæ Saving to CSV file...")

csv_filename = "technodom_smartphones.csv"
with open(csv_filename, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["Title", "Link", "Reviews"])
    writer.writeheader()
    writer.writerows(all_products)

print(f"‚úÖ Data saved to file: {csv_filename}")


üîÑ Parsing page 1: https://www.technodom.kz/catalog/smartfony-i-gadzhety/smartfony-i-telefony/smartfony?page=1

üì± Product: –°–º–∞—Ä—Ç—Ñ–æ–Ω Apple iPhone 13 4/128GB Midnight
üîó Link: https://www.technodom.kz/p/smartfon-gsm-apple-iphone-13-128gb-thx-61-12-5-midnight-252945?source=catalog&category_code=smartfony-i-gadzhety%2Fsmartfony-i-telefony%2Fsmartfony
üí¨ Reviews:
 - Review: –æ—Ç–ª–∏—á–Ω—ã–π —Å–º–∞—Ä—Ç—Ñ–æ–Ω –≤—Å–µ–º —Å–æ–≤–µ—Ç—É—é—É
 - Review: –û—á–µ–Ω—å —Ö–æ—Ä–æ—à–∏–π —Ç–µ–ª–µ—Ñ–æ–Ω üëçüèªüëçüèª
 - Review: –°—É–ø–µ—Ä –≤—Å–µ –æ—Ç–ª–∏—á–Ω–æ
 - Review: –û—Ç–ª–∏—á–Ω–æ —Ä–∞–±–æ—Ç–∞–µ—Ç, –≤—Å—ë –Ω—Ä–∞–≤–∏—Ç—Å—è
------------------------------------------------------------

üì± Product: –°–º–∞—Ä—Ç—Ñ–æ–Ω Huawei Nova Y72s 256GB –ß–µ—Ä–Ω—ã–π
üîó Link: https://www.technodom.kz/p/smartfon-gsm-huawei-nova-y72s-8-256gb-cherniy-287914?source=catalog&category_code=smartfony-i-gadzhety%2Fsmartfony-i-telefony%2Fsmartfony
‚ùå No reviews found.
------------------------------------------

In [11]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [13]:
import csv
import time
import pandas as pd
from deep_translator import GoogleTranslator

# Turn the scraped per-product CSV into a flat, de-duplicated list of reviews,
# translate each review to English, and save the cleaned result.
input_file = "technodom_smartphones.csv"
output_file = "technodom_reviews_for_nlp.csv"
final_file = "technodom_reviews_cleaned_final.csv"

# Placeholder strings the scraper writes when there is no real review text;
# they must not be translated and exported as if they were reviews.
placeholder_reviews = {"No reviews", "No review text."}

# Step 1: Read and prepare reviews for translation
all_reviews = []
# newline='' lets the csv module handle line breaks embedded in quoted fields.
with open(input_file, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        reviews_raw = row["Reviews"] or ""  # a short row yields None for the missing column
        split_reviews = (r.strip() for r in reviews_raw.split("|||"))
        all_reviews.extend(r for r in split_reviews if r and r not in placeholder_reviews)

# Remove duplicates while keeping first-seen order, so reruns give the same file
all_reviews = list(dict.fromkeys(all_reviews))

# Save reviews to a new CSV file with just one column: "review"
with open(output_file, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["review"])
    for review in all_reviews:
        writer.writerow([review])

print(f"‚úÖ Reviews saved to NLP-ready file: {output_file}")

# Step 2: Translate reviews using Google Translate
translator = GoogleTranslator(source='auto', target='en')
translated_rows = []

with open(output_file, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        review = row['review']
        try:
            translated = translator.translate(review)
            translated_rows.append({"original": review, "translated": translated})
            print(f"‚úÖ Translated: {translated}")
        except Exception as e:
            # Best effort: mark the row so it can be filtered out below.
            print(f"‚ö†Ô∏è Translation error: {e}")
            translated_rows.append({"original": review, "translated": "TRANSLATION_ERROR"})
        finally:
            # Pause after every attempt, failed ones included, so errors do not
            # turn into a burst of back-to-back requests.
            time.sleep(0.5)

# Step 3: Clean the data
# Explicit columns keep the frame well-formed even when nothing was translated.
df = pd.DataFrame(translated_rows, columns=["original", "translated"])

# Drop rows where the "translated" column has missing values or errors
is_translated = df["translated"].notna() & (df["translated"] != "TRANSLATION_ERROR")

# Step 4: Remove duplicate rows
# Step 5: Remove paragraphs (newlines) and replace them with ". "
# Built as one chain that returns a new frame, so nothing is assigned into a
# filtered slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_cleaned = (
    df[is_translated]
    .drop_duplicates()
    .assign(translated=lambda frame: frame["translated"].str.replace('\n', '. ', regex=False))
)

# Save the final cleaned data with every field quoted
df_cleaned.to_csv(final_file, index=False, quoting=csv.QUOTE_ALL)

print(f"‚úÖ Final cleaned data saved to {final_file}")

‚úÖ Reviews saved to NLP-ready file: technodom_reviews_for_nlp.csv
‚úÖ Translated: inexpensive
‚úÖ Translated: works)
‚úÖ Translated: The phone is excellent at an affordable price does not hang does not lag supports heavy games. I recommend
‚úÖ Translated: Does not contain built -in applications from Google. A large amount of RAM and internal memory. Enlightenment camera.
‚úÖ Translated: Very good phone
‚úÖ Translated: a lot of attachments, diverse social networks are loaded
‚úÖ Translated: powerful
‚úÖ Translated: The phone is fire, I don‚Äôt know at the expense of games, I took my wife, for two days it calmly holds without aim
‚úÖ Translated: A good flagship
‚úÖ Translated: Everything is fine, works with a bang.
‚úÖ Translated: per 100 üíØ
‚úÖ Translated: While my daughter is happy
‚úÖ Translated: Great smartphone
‚úÖ Translated: Amir is the best seller
‚úÖ Translated: Apple is Apple
‚úÖ Translated: Great phone, very nimble, cool camera and super display. Face ID for Android!
‚úÖ Tr

In [14]:
# Upload the final cleaned CSV to the S3 bucket with the AWS CLI.
# Assumes the AWS CLI is installed and has credentials for this bucket — TODO confirm.
# NOTE(review): the destination key has no ".csv" extension, unlike the local
# file name — confirm this is intended.
!aws s3 cp technodom_reviews_cleaned_final.csv s3://zanggar/technodom_reviews/technodom_reviews_cleaned_final

upload: ./technodom_reviews_cleaned_final.csv to s3://zanggar/technodom_reviews/technodom_reviews_cleaned_final
