In [1]:
import pandas as pd
import pyodbc
import paramiko
import numpy as np
import gzip
import os
import csv
from datetime import datetime, timezone

## Download Files

### Most Advs

In [2]:
%%time
import paramiko
import os
from datetime import datetime, timezone

# SFTP credentials and connection details
sftp_host = 'ftp.admarketplace.net'
sftp_port = 8022
username = 'ywang'
password = '123456789'  # Recommend using environment variables for security

# Dictionary to store advertiser SFTP paths, local directories, and file naming conventions
advertisers = {
    'Bloomingdales': {
        'sftp_path': '/sftp/l_bloomingdales/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/Bloomingdales/',
        'file_pattern': lambda date: f'{date}_Bloomingdales_PLA.csv',
        'final_name': 'Bloomingdales_PLA.csv'
    },
    'Verizon': {
        'sftp_path': '/sftp/l_verizon/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/Verizon/',
        'file_pattern': 'verizon_devices_admarketplace.csv',
        'final_name': 'Verizon_PLA.csv'
    },
    'BedBathBeyond': {
        'sftp_path': '/sftp/l_BedBathBeyond/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/',
        'file_pattern': lambda date: f'{date}_BedBathAndBeyond_PLA.csv.gz',
        'final_name': 'BedBathBeyond_PLA.csv.gz'
    },
    'HarryDavid': {
        'sftp_path': '/sftp/l_HarryDavid/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/HarryDavid/',
        'file_pattern': 'hd_admarketplace.csv',
        'final_name': 'HarryDavid_PLA.csv'
    },
    'TommyBahama': {
        'sftp_path': '/sftp/l_Tommybahama/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/TommyBahama/',
        'file_pattern': lambda date: f'{date}_TommyBahama_PLA.csv',
        'final_name': 'TommyBahama_PLA.csv'
    },
    'Houzz': {
        'sftp_path': '/sftp/l_houzz/files/pla_feed/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/Houzz/',
        'file_pattern': 'houzz_full_catalog.txt.gz',
        'final_name': 'Houzz_PLA.txt.gz'
    },
    'Zappos': {
        'sftp_path': '/sftp/l_Zappos-1/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/Zappos/',
        'file_pattern': 'zapoos_adsmarketplace.txt.gz',
        'final_name': 'Zappos_PLA.txt.gz'
    },
    'HomeDepot': {
        'sftp_path': '/sftp/l_homedepot/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/TheHomeDepot/',
        'file_pattern': lambda date: f'{date}_TheHomeDepot_029A.csv.gz',
        'final_name': 'HomeDepot_PLA.csv.gz'
    },
    'NewBalance': {
        'sftp_path': '/sftp/l_Newbalance/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/NewBalance/',
        'file_pattern': lambda date: f'{date}_NewBalance_ PLA.csv',
        'final_name': 'NewBalance_PLA.csv'
    }
}

# Helper function to check if a file exists locally and was modified today
def is_local_file_modified_today(local_file_path):
    if os.path.exists(local_file_path):
        modification_time = datetime.fromtimestamp(os.path.getmtime(local_file_path))
        return modification_time.date() == datetime.now().date(), modification_time
    return False, None

# Helper function to check if the file on the SFTP server was modified today
def is_sftp_file_modified_today(sftp, file_name):
    try:
        file_attr = sftp.stat(file_name)
        modification_time = datetime.fromtimestamp(file_attr.st_mtime, timezone.utc)
        return modification_time.date() == datetime.now(timezone.utc).date(), modification_time
    except FileNotFoundError:
        return False, None

# Establish SFTP connection once
try:
    transport = paramiko.Transport((sftp_host, sftp_port))
    transport.connect(username=username, password=password)
    sftp = paramiko.SFTPClient.from_transport(transport)

    # Get the current date string
    current_date = datetime.now().strftime('%Y%m%d')

    # Step 1: Process all advertisers
    for advertiser, paths in advertisers.items():
        sftp_folder = paths['sftp_path']
        local_folder = paths['local_path']
        file_pattern = paths['file_pattern']
        final_filename = paths['final_name']  # Static name for overwriting

        # Generate file name for SFTP (dynamic or static)
        sftp_filename = file_pattern(current_date) if callable(file_pattern) else file_pattern
        local_file_path = os.path.join(local_folder, final_filename)  # Always save as fixed name

        # Ensure local folder exists
        os.makedirs(local_folder, exist_ok=True)

        # Check if the file exists locally and was updated today
        local_exists, local_mod_time = is_local_file_modified_today(local_file_path)
        if local_exists:
            print(f"{advertiser}: Local file '{final_filename}' already exists and was modified today ({local_mod_time}). Skipping download.")
            continue  # Skip if already downloaded today

        # Navigate to SFTP folder
        sftp.chdir(sftp_folder)

        # Check if the SFTP file was modified today
        sftp_updated_today, sftp_mod_time = is_sftp_file_modified_today(sftp, sftp_filename)
        if not sftp_updated_today:
            print(f"{advertiser}: File '{sftp_filename}' on SFTP was last modified on {sftp_mod_time}. Skipping download.")
            continue  # Skip if file not updated today

        try:
            # Download the file and save it with a fixed name (overwrite)
            sftp.get(sftp_filename, local_file_path)
            print(f"{advertiser}: Downloaded '{sftp_filename}' and saved as '{final_filename}' (Overwritten).")
        except Exception as e:
            print(f"{advertiser}: Failed to download '{sftp_filename}'. Error: {e}")

except Exception as e:
    print(f"An error occurred while downloading: {e}")

finally:
    if sftp:
        sftp.close()
    if transport:
        transport.close()


Bloomingdales: File '20250609_Bloomingdales_PLA.csv' on SFTP was last modified on None. Skipping download.
Verizon: File 'verizon_devices_admarketplace.csv' on SFTP was last modified on None. Skipping download.
BedBathBeyond: File '20250609_BedBathAndBeyond_PLA.csv.gz' on SFTP was last modified on None. Skipping download.
HarryDavid: Local file 'HarryDavid_PLA.csv' already exists and was modified today (2025-06-09 07:50:25.216850). Skipping download.
TommyBahama: File '20250609_TommyBahama_PLA.csv' on SFTP was last modified on None. Skipping download.
Houzz: File 'houzz_full_catalog.txt.gz' on SFTP was last modified on 2025-03-30 18:56:56+00:00. Skipping download.
Zappos: File 'zapoos_adsmarketplace.txt.gz' on SFTP was last modified on 2025-04-25 19:24:34+00:00. Skipping download.
HomeDepot: File '20250609_TheHomeDepot_029A.csv.gz' on SFTP was last modified on None. Skipping download.
NewBalance: Local file 'NewBalance_PLA.csv' already exists and was modified today (2025-06-09 07:50:27

Under Armour

In [3]:
%%time
import paramiko
import os
from datetime import datetime, timezone

# SFTP credentials and connection details
sftp_host = 'sftp.admarketplace.net'
sftp_port = 22
username = 'underarmour'
password = 'supports-GALE-mobility-postman'  # Recommend using environment variables for security

# Dictionary to store advertiser SFTP paths, local directories, and file naming conventions
advertisers = {
    'UnderArmour': {
        'sftp_path': '/files/',
        'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/UnderArmour/',
        'file_pattern': 'UA_AMP.csv',
        'final_name': 'UnderArmour_PLA.csv'
    }
}

# Helper function to check if a file exists locally and was modified today
def is_local_file_modified_today(local_file_path):
    if os.path.exists(local_file_path):
        modification_time = datetime.fromtimestamp(os.path.getmtime(local_file_path))
        return modification_time.date() == datetime.now().date(), modification_time
    return False, None

# Helper function to check if the file on the SFTP server was modified today
def is_sftp_file_modified_today(sftp, file_name):
    try:
        file_attr = sftp.stat(file_name)
        modification_time = datetime.fromtimestamp(file_attr.st_mtime, timezone.utc)
        return modification_time.date() == datetime.now(timezone.utc).date(), modification_time
    except FileNotFoundError:
        return False, None

# Establish SFTP connection once
try:
    transport = paramiko.Transport((sftp_host, sftp_port))
    transport.connect(username=username, password=password)
    sftp = paramiko.SFTPClient.from_transport(transport)

    # Get the current date string
    current_date = datetime.now().strftime('%Y%m%d')

    # Step 1: Process all advertisers
    for advertiser, paths in advertisers.items():
        sftp_folder = paths['sftp_path']
        local_folder = paths['local_path']
        file_pattern = paths['file_pattern']
        final_filename = paths['final_name']  # Static name for overwriting

        # Generate file name for SFTP (dynamic or static)
        sftp_filename = file_pattern(current_date) if callable(file_pattern) else file_pattern
        local_file_path = os.path.join(local_folder, final_filename)  # Always save as fixed name

        # Ensure local folder exists
        os.makedirs(local_folder, exist_ok=True)

        # Check if the file exists locally and was updated today
        local_exists, local_mod_time = is_local_file_modified_today(local_file_path)
        if local_exists:
            print(f"{advertiser}: Local file '{final_filename}' already exists and was modified today ({local_mod_time}). Skipping download.")
            continue  # Skip if already downloaded today

        # Navigate to SFTP folder
        sftp.chdir(sftp_folder)

        # Check if the SFTP file was modified today
        sftp_updated_today, sftp_mod_time = is_sftp_file_modified_today(sftp, sftp_filename)
        if not sftp_updated_today:
            print(f"{advertiser}: File '{sftp_filename}' on SFTP was last modified on {sftp_mod_time}. Skipping download.")
            continue  # Skip if file not updated today

        try:
            # Download the file and save it with a fixed name (overwrite)
            sftp.get(sftp_filename, local_file_path)
            print(f"{advertiser}: Downloaded '{sftp_filename}' and saved as '{final_filename}' (Overwritten).")
        except Exception as e:
            print(f"{advertiser}: Failed to download '{sftp_filename}'. Error: {e}")

except Exception as e:
    print(f"An error occurred while downloading: {e}")

finally:
    if sftp:
        sftp.close()
    if transport:
        transport.close()


UnderArmour: Local file 'UnderArmour_PLA.csv' already exists and was modified today (2025-06-09 07:50:13.119061). Skipping download.
CPU times: user 5.38 ms, sys: 3.56 ms, total: 8.94 ms
Wall time: 460 ms


<!-- ### Spanx -->

In [None]:
# # SFTP credentials and connection details
# sftp_host = 'sftpgo.feedonomics.com'
# sftp_port = 22  # Default port for SFTP
# username = 'fdx_eb4e950355841'
# password = 'c0dff59b60d30e836cd6a5f0'  # Recommend using environment variables for credentials

# advertisers = {
#     'Spanx': {
#         'sftp_path': '/incoming/',
#         'local_path': '/Volumes/T9/AMP/KlarnaShoppingAds/Spanx/',
#         'file_pattern': 'Spanx_AdMarketplace.csv.gz'  # Example: 20241017_Bloomingdales_PLA.csv
#     }
# }

# # Helper function to check if a file was modified today
# def is_file_modified_today(sftp, file_name):
#     file_attr = sftp.stat(file_name)
#     modification_time = datetime.fromtimestamp(file_attr.st_mtime, timezone.utc)
#     return modification_time.date() == datetime.now(timezone.utc).date(), modification_time

# # Helper function to check if the local file already exists and was modified today
# def is_local_file_modified_today(local_file_path):
#     if os.path.exists(local_file_path):
#         modification_time = datetime.fromtimestamp(os.path.getmtime(local_file_path))
#         return modification_time.date() == datetime.now().date(), modification_time
#     return False, None

# # Establish SFTP connection once
# try:
#     transport = paramiko.Transport((sftp_host, sftp_port))
#     transport.connect(username=username, password=password)
#     sftp = paramiko.SFTPClient.from_transport(transport)

#     # Get the current date string
#     current_date = datetime.now().strftime('%Y%m%d')

#     # Step 1: Download all files from SFTP into local folders
#     for advertiser, paths in advertisers.items():
#         sftp_folder = paths['sftp_path']
#         local_folder = paths['local_path']
#         file_pattern = paths['file_pattern']  # Naming convention for the file

#         # Generate the file name based on whether it's a callable (lambda) or a static string
#         input_file_name = file_pattern(current_date) if callable(file_pattern) else file_pattern
#         local_file_path = os.path.join(local_folder, input_file_name)

#         # Ensure the local folder exists, but don't create the file yet
#         os.makedirs(local_folder, exist_ok=True)

#         # Check if the local file exists and was modified today
#         local_exists, local_mod_time = is_local_file_modified_today(local_file_path)
#         if local_exists:
#             print(f"{advertiser}: Local file '{input_file_name}' already exists and was modified on {local_mod_time}. Skipping download.")
#             continue

#         # Navigate to the advertiser's SFTP directory
#         sftp.chdir(sftp_folder)

#         # For fixed-name files, check if the file was updated today before downloading
#         if not callable(file_pattern):  # For static file names
#             sftp_updated_today, sftp_mod_time = is_file_modified_today(sftp, input_file_name)
#             if not sftp_updated_today:
#                 print(f"{advertiser}: File '{input_file_name}' on SFTP was last modified on {sftp_mod_time}. Skipping download.")
#                 continue  # Skip download if file was not updated today

#         try:
#             # Download the file from SFTP to the local system (local file will only be created if download is successful)
#             sftp.get(input_file_name, local_file_path)
#             print(f"{advertiser}: File downloaded from SFTP and saved locally as {local_file_path}")
#         except Exception as e:
#             print(f"{advertiser}: Failed to download '{input_file_name}'. Error: {e}")
#             # Remove any partially created or empty file after download failure
#             if os.path.exists(local_file_path):
#                 os.remove(local_file_path)

# except Exception as e:
#     print(f"An error occurred while downloading: {e}")

# finally:
#     if sftp:
#         sftp.close()
#     if transport:
#         transport.close()

Spanx: File downloaded from SFTP and saved locally as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Spanx/Spanx_AdMarketplace.csv.gz


### Ulta

In [42]:
%%time
import os
import requests
from datetime import datetime

# Specify the folder path
folder_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Ulta'

# Generate the current timestamp in 'yyyymmddhhmmss' format for the filename
current_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
input_file_name = f'feed_{current_timestamp}_33000020.txt'
output_file_name = 'ulta.tsv'

# Create the full file paths
input_file_path = os.path.join(folder_path, input_file_name)
output_file_path = os.path.join(folder_path, output_file_name)

# URL for the daily download
url = "https://webadapters.channeladvisor.com/CSEAdapter/Default.aspx?pid=V%5bP%5e%5eC%5ePAosvB6Z.X%5b3KePQjFGq_%5bZX2%5bLd%22(%3dsFt4%5b%60%26K2Ic%23)gwz%3d7Z%5eY%5bbI_SQ8DLu_U%2f%26%5ebucR(%3cwz"

# Download the file
try:
    response = requests.get(url)
    response.raise_for_status()
    
    # Save the downloaded content to the specified input file
    with open(input_file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded file saved as: {input_file_path}")

    # Read the content from the text file and save it as TSV
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
    with open(output_file_path, 'w', encoding='utf-8') as tsv_file:
        tsv_file.write(content)
    
    print(f"File has been saved as TSV at: {output_file_path}")

except requests.HTTPError as e:
    print(f"HTTP error occurred: {e}")
except UnicodeDecodeError:
    print("Error: Could not decode the file. Please check the file encoding or try using a different encoding.")

Downloaded file saved as: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Ulta\feed_20250424161740_33000020.txt
File has been saved as TSV at: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Ulta\ulta.tsv
CPU times: total: 2.45 s
Wall time: 5.12 s


### UK ADVS

In [11]:
%%time
import os
import requests
import pandas as pd
from datetime import datetime

# Dictionary of advertisers with corrected URLs
advertisers = {
    "Sephora_UK": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Sephora",
        "url": "http://files-as.intelligentreach.com/feedexports/1a627511-1fcb-4ea2-885c-b849e10a8688/Feel_Unique_UK_Admarketplace.tsv"
    },
    "MyProtein_UK": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK",
        "url": "https://admarketplace:KIBC07LjHMTTcSdhRzmd@productfeeds.thehut.net/feeds/Myprotein_uk_admarketplace_2.csv"
    },
    "LookFantastic_UK": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK",
        "url": "https://admarketplace:KIBC07LjHMTTcSdhRzmd@productfeeds.thehut.net/feeds/admarketplace_LFUK_feed.csv"
    },
    "LookFantastic_FR": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR",
        "url": "https://admarketplace:KIBC07LjHMTTcSdhRzmd@productfeeds.thehut.net/feeds/admarketplace_LFFR_feed.csv"
    },
    "LookFantastic_IT": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT",
        "url": "https://admarketplace:KIBC07LjHMTTcSdhRzmd@productfeeds.thehut.net/feeds/admarketplace_LFIT_feed.csv"
    },
    "NewLook_UK": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/New Look",
        "url": "https://webadapters.channeladvisor.com/CSEAdapter/Default.aspx?pid=TZPa_C_PAhuxHe*-X%5b%5eI%2f%23QjJJq_%5c%5bU1%5bv%2f%23%5cmBLvb%5cXTK3t%2f%26*iEG%3dk(*P%5bbE1XQnGGs_*%2fX00E%5dX%5b%3drI"
    },
    "Vodafone_UK": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone",
        "url": "https://feed-download.bigupdata.co.uk/download/?lnk=0ef2b49a301a46c29ff0acf5a913ad75"
    }
}

# Columns to remove for Sephora
redundant_columns = {
    "c:GA_product_id", "c:nov_score", "c:profit_margin_flag", "c:returns_margin_flag", "c:feed_market",
    "c:allocation", "c:stock_level_flag", "c:IM_product_name", "c:IM_beauty_brand", "c:IM_product_category",
    "c:IM_range", "c:flag_plus_size", "c:flag_maternity", "c:flag_maternity", "product_width", "product_length",
    "product_height", "shipping_weight", "shipping_length", "unit_pricing_measure", "unit_pricing_base_measure",
    "product_review_average", "c:Main_Highlights", "c:Shortened_Name_Ads", "c:Alternative Image URL (1)",
    "c:display ads link", "display ads title", "adwords_labels", "adwords_grouping", "c:Was Price (inc VAT)",
    "c:ultimate_price", "promotion_id", "subscription_cost", "instalment", "custom_label_0", "custom_label_1",
    "custom_label_2", "custom_label_3", "custom_label_4", "custom_number_0", "custom_number_1", "custom_number_2",
    "custom_number_3", "custom_number_4", "c:shipping(country:price:min_handling_time:max_han)", "shipping_label",
    "shopping_ads_excluded_country", "excluded_destination", "return_policy_label", "max_handling_time",
    "min_handling_time", "isbn", "identifier exists"
}

# Generate timestamp for unique filenames
current_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# Process each advertiser
for advertiser, data in advertisers.items():
    folder_path = data["folder"]
    url = data["url"]

    # Ensure folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Determine file extension from URL
    extension = ".csv" if url.endswith(".csv") else ".tsv" if url.endswith(".tsv") else ".txt"
    delimiter = "," if extension == ".csv" else "\t"

    # Generate input and output file names
    input_file_name = f'feed_{current_timestamp}{extension}'
    output_file_name = f'{advertiser.lower()}.tsv'  # Save output as .tsv

    # Full file paths
    input_file_path = os.path.join(folder_path, input_file_name)
    output_file_path = os.path.join(folder_path, output_file_name)

    # Download the file
    try:
        response = requests.get(url)
        response.raise_for_status()

        # Save the raw downloaded content
        with open(input_file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded file for {advertiser} saved as: {input_file_path}")

        # Attempt to read file with correct delimiter
        df = pd.read_csv(input_file_path, sep=delimiter, low_memory=False, dtype=str, on_bad_lines="skip")
        
        # Check if all data is in one column and try alternative delimiter
        if len(df.columns) == 1:
            print(f"Warning: {advertiser} data appears to be in one column. Attempting to split...")
            df = pd.read_csv(input_file_path, sep="\t" if delimiter == "," else ",", low_memory=False, dtype=str, on_bad_lines="skip")
        
        # Remove redundant columns for Sephora
        if advertiser == "Sephora":
            df = df.drop(columns=[col for col in redundant_columns if col in df.columns], errors='ignore')
        
        # Save the processed file as TSV
        df.to_csv(output_file_path, sep='\t', index=False)

        print(f"Processed and saved: {output_file_path}")

    except requests.HTTPError as e:
        print(f"HTTP error occurred while downloading {advertiser}: {e}")
    except UnicodeDecodeError:
        print(f"Error: Could not decode file for {advertiser}. Please check encoding.")
    except Exception as e:
        print(f"Unexpected error while processing {advertiser}: {e}")

print("All downloads complete.")




Downloaded file for Sephora_UK saved as: /Volumes/T9/AMP/KlarnaShoppingAds/Sephora/feed_20250609072943.tsv
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Sephora/sephora_uk.tsv
Downloaded file for MyProtein_UK saved as: /Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK/feed_20250609072943.csv
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK/myprotein_uk.tsv
Downloaded file for LookFantastic_UK saved as: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK/feed_20250609072943.csv
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK/lookfantastic_uk.tsv
Downloaded file for LookFantastic_FR saved as: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR/feed_20250609072943.csv
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR/lookfantastic_fr.tsv
Downloaded file for LookFantastic_IT saved as: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT/feed_20250609072943.csv
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Lo

In [12]:
%%time
import pandas as pd
import urllib.parse
import os

# Define advertiser-specific settings
advertisers = {
    "LookFantastic_UK": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK", "ctaid": "1203"},
    "LookFantastic_FR": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR", "ctaid": "1203"},
    "LookFantastic_IT": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT", "ctaid": "1203"},
    "NewLook_UK": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/New Look", "ctaid": "1205"},
    "Sephora_UK": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Sephora", "ctaid": "1325"},
    "MyProtein_UK": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK", "ctaid": "1209"},
    "Vodafone_UK": {"folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone", "ctaid": "1153"}
}

# Standardized column names
standard_columns = [
    "SKU/id", "Name", "Price", "Shipping costs", "Stock status", "Delivery time", "Manufacturer",
    "EAN/GTIN", "Manufacturer SKU / MPN", "URL", "Image URL", "Category", "Description", "Condition",
    "Sale Price", "Sale Price Effective Date", "Color", "Size", "SizeSystem", "AdultContent",
    "AgeGroup", "Bundled", "EnergyEfficiencyClass", "Gender", "GroupId", "Material", "Multipack", "Pattern"
]

# Column renaming mappings
column_mapping_general = {
    'id': 'SKU/id', 'title': 'Name', 'price': 'Price', 'shipping': 'Shipping costs', 'availability': 'Stock status',
    'availability_date': 'Delivery time', 'brand': 'Manufacturer', 'gtin': 'EAN/GTIN', 'mpn': 'Manufacturer SKU / MPN',
    'link': 'URL', 'image_link': 'Image URL', 'google_product_category': 'Category', 'description': 'Description',
    'adult': 'AdultContent', 'age_group': 'AgeGroup', 'color': 'Color', 'condition': 'Condition', 'item_group_id': 'GroupId',
    'material': 'Material', 'pattern': 'Pattern', 'size': 'Size', 'size_system': 'SizeSystem', 'sale_price': 'Sale Price',
    'sale_price_effective_date': 'Sale Price Effective Date', 'gender': 'Gender', 'multipack': 'Multipack',
    'bundled': 'Bundled', 'energy_efficiency_class': 'EnergyEfficiencyClass'
}

column_mapping_myprotein = {
    'ID': 'SKU/id', 'Title': 'Name', 'Price': 'Price', 'Exit URL': 'URL', 'Image URL': 'Image URL',
    'Availability': 'Stock status', 'Brand': 'Manufacturer', 'gtin': 'EAN/GTIN',
    'Google Product Category': 'Category', 'Description': 'Description', 'Size': 'Size'
}

column_mapping_lookfantastic = {
    'id': 'SKU/id', 'name': 'Name', 'price': 'Price', 'producturl': 'URL', 'bigimage': 'Image URL',
    'instock': 'Stock status', 'brand': 'Manufacturer', 'gtin': 'EAN/GTIN', 'saleprice': 'Sale Price',
    'googlecategory': 'Category', 'Description': 'Description', 'Size': 'Size','gender': 'Gender'
}

# Base URL template for Klarna tracking
base_url_template = "https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid={ctaid}&v=1.3&source=als_tiles"

# Process each advertiser
for advertiser, data in advertisers.items():
    folder_path = data["folder"]
    ctaid = data["ctaid"]
    input_file_path = os.path.join(folder_path, f"{advertiser.lower()}.tsv")
    output_file_path = os.path.join(folder_path, f"amp_klarna_{advertiser.lower()}.tsv.gz")

    if not os.path.exists(input_file_path):
        print(f"Skipping {advertiser}: File not found ({input_file_path})")
        continue

    df = pd.read_csv(input_file_path, sep='\t', low_memory=False, dtype=str, on_bad_lines="skip", quoting=3)
    
    # LookFantastic-specific fix
    if advertiser.startswith("LookFantastic") and "producturl" in df.columns: 
        if "producturl" in df.columns:   
            df.rename(columns={"producturl": "link"}, inplace=True)
        df.rename(columns=column_mapping_lookfantastic, inplace=True)
    
    # Apply MyProtein-specific processing
    if advertiser == "MyProtein_UK":
        if "Exit URL" in df.columns:
            df.rename(columns={"Exit URL": "URL"}, inplace=True)
        df.rename(columns=column_mapping_myprotein, inplace=True)
    else:
        df.rename(columns=column_mapping_general, inplace=True)
    
    if 'URL' not in df.columns:
        print(f"Skipping {advertiser}: 'URL' column missing")
        continue

    df['URL'] = df['URL'].astype(str).fillna('')
    base_url = base_url_template.format(ctaid=ctaid)
    df['URL'] = df['URL'].apply(lambda x: f"{base_url}&cu={urllib.parse.quote_plus(x)}&fbu={urllib.parse.quote_plus(x)}")

    # Clean 'EAN/GTIN'
    if 'EAN/GTIN' in df.columns:
        df['EAN/GTIN'] = df['EAN/GTIN'].astype(str).apply(lambda x: x.rstrip('.0') if '.0' in x else x)

    # Ensure numerical columns retain original values
    for col in ['SKU/id', 'EAN/GTIN', 'Price']:
        if col in df.columns:
            df[col] = df[col].astype(str).fillna('')

    # Add missing columns
    for col in standard_columns:
        if col not in df.columns:
            df[col] = ""

    df = df[standard_columns]
    df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')
    print(f"Processed and saved: {output_file_path}")

print("All advertisers processed successfully.")

Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK/amp_klarna_lookfantastic_uk.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR/amp_klarna_lookfantastic_fr.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT/amp_klarna_lookfantastic_it.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/New Look/amp_klarna_newlook_uk.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Sephora/amp_klarna_sephora_uk.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK/amp_klarna_myprotein_uk.tsv.gz
Processed and saved: /Volumes/T9/AMP/KlarnaShoppingAds/Vodafone/amp_klarna_vodafone_uk.tsv.gz
All advertisers processed successfully.
CPU times: user 5.81 s, sys: 283 ms, total: 6.09 s
Wall time: 6.2 s


In [None]:
%%time
import os
import paramiko
import glob

# Define advertiser-specific settings
advertisers = {
    "LookFantastic_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/UK/",
    "LookFantastic_FR": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/FR/",
    "LookFantastic_IT": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/IT/",
    # "NewLook_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/New Look",
    "Sephora_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Sephora/",
    "MyProtein_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/My Protein/UK/",
    "Vodafone_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Voda/fone/",
    "BedBathBeyond_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/BedBathBeyond/",
    "Bloomingdales_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Bloomingdales/",
    "HarryDavid_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/HarryDavid/",
    "Houzz_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Houzz/",
    "NewBalance_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/NewBalance/",
    "TheHomeDepot_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TheHomeDepot/",
    "TommyBahama_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TommyBahama/",
    "Ulta_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Ulta/",
    "Verizon_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Verizon/",
    "Wayfair_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Wayfair/",
    "Zappos_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Zappos",
    "UnderArmour_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/UnderArmour/"
}

# SFTP Configuration
sftp_host = "dev-sftp.admarketplace.net"
sftp_port = 22
sftp_username = "l_klarnapricerun"
sftp_password = "9ir5nukn2JGEPDC5AZsiett4"
sftp_target_folder = "/sftp/l_klarnapricerun/files"

# Upload files to SFTP
def upload_to_sftp(local_file_path, remote_file_path):
    try:
        transport = paramiko.Transport((sftp_host, sftp_port))
        transport.connect(username=sftp_username, password=sftp_password)
        
        # Use SFTPClient.from_transport to initialize the SFTP session
        with paramiko.SFTPClient.from_transport(transport) as sftp:
            sftp.put(local_file_path.replace("\\", "/"), remote_file_path)
            print(f"Successfully uploaded: {local_file_path} -> {remote_file_path}")
        
        transport.close()
    except Exception as e:
        print(f"Failed to upload {local_file_path}: {e}")

# Upload all processed files
for advertiser, folder_path in advertisers.items():
    file_pattern = os.path.join(folder_path, f"amp_klarna_{advertiser.lower()}.tsv.gz").replace("\\", "/")
    matching_files = glob.glob(file_pattern)
    
    if not matching_files:
        print(f"Skipping {advertiser}: Processed file not found ({file_pattern})")
        continue
    
    for output_file_path in matching_files:
        remote_file_path = os.path.join(sftp_target_folder, os.path.basename(output_file_path)).replace("\\", "/")
        upload_to_sftp(output_file_path, remote_file_path)

print("All processed files uploaded successfully.")

Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/IT/amp_klarna_lookfantastic_it.tsv.gz: failure: open no such file or directory
All processed files uploaded successfully.
CPU times: total: 62.5 ms
Wall time: 654 ms


In [None]:
%%time
import os
import time
import glob
import paramiko

# Advertiser folders
advertisers = {
    "LookFantastic_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/UK/",
    "LookFantastic_FR": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/FR/",
    "LookFantastic_IT": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/IT/",
    "Sephora_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Sephora/",
    "MyProtein_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/My Protein/UK/",
    "Vodafone_UK": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Vodafone/",
    "BedBathBeyond_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/BedBathBeyond/",
    "Bloomingdales_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Bloomingdales/",
    "HarryDavid_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/HarryDavid/",
    "Houzz_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Houzz/",
    "NewBalance_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/NewBalance/",
    "TheHomeDepot_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TheHomeDepot/",
    "TommyBahama_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TommyBahama/",
    "Ulta_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Ulta/",
    "Verizon_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Verizon/",
    "Wayfair_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Wayfair/",
    "Zappos_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Zappos",
    "UnderArmour_US": "C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/UnderArmour/"
}

# SFTP credentials
sftp_host = "dev-sftp.admarketplace.net"
sftp_port = 22
sftp_username = "l_klarnapricerun"
sftp_password = "9ir5nukn2JGEPDC5AZsiett4"
sftp_target_folder = "/sftp/l_klarnapricerun/files"

# Upload function
def upload_to_sftp(local_file_path, remote_file_path):
    try:
        transport = paramiko.Transport((sftp_host, sftp_port))
        transport.connect(username=sftp_username, password=sftp_password)
        with paramiko.SFTPClient.from_transport(transport) as sftp:
            sftp.put(local_file_path.replace("\\", "/"), remote_file_path)
            print(f"✅ Uploaded: {os.path.basename(local_file_path)}")
        transport.close()
    except Exception as e:
        print(f"❌ Failed to upload {local_file_path}: {e}")

# Gather all uploadable files
upload_tasks = []
for advertiser, folder_path in advertisers.items():
    expected_filename = f"amp_klarna_{advertiser.lower()}.tsv.gz"
    file_path = os.path.join(folder_path, expected_filename).replace("\\", "/")
    if os.path.exists(file_path):
        remote_path = os.path.join(sftp_target_folder, os.path.basename(file_path)).replace("\\", "/")
        upload_tasks.append((advertiser, file_path, remote_path))
    else:
        print(f"⚠️ Skipping {advertiser} — file not found: {file_path}")

# Try different batch sizes
for batch_size in range(2, len(upload_tasks) + 1):
    print(f"\n🚀 Starting test with batch size: {batch_size}")
    for i in range(0, len(upload_tasks), batch_size):
        batch = upload_tasks[i:i + batch_size]
        print(f"⏳ Uploading batch: {[adv for adv, _, _ in batch]}")

        for adv, local_file, remote_file in batch:
            upload_to_sftp(local_file, remote_file)

        time.sleep(1)  # 1 second delay between batches
    print(f"✅ Finished round for batch size {batch_size}")

print("\n🎉 All batch test uploads complete.")



🚀 Starting test with batch size: 2
⏳ Uploading batch: ['LookFantastic_UK', 'LookFantastic_FR']
✅ Uploaded: amp_klarna_lookfantastic_uk.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Look Fantastic/FR/amp_klarna_lookfantastic_fr.tsv.gz: failure: open no such file or directory
⏳ Uploading batch: ['LookFantastic_IT', 'Sephora_UK']
✅ Uploaded: amp_klarna_lookfantastic_it.tsv.gz
✅ Uploaded: amp_klarna_sephora_uk.tsv.gz
⏳ Uploading batch: ['MyProtein_UK', 'Vodafone_UK']
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/My Protein/UK/amp_klarna_myprotein_uk.tsv.gz: failure: open no such file or directory
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Vodafone/amp_klarna_vodafone_uk.tsv.gz: failure: open no such file or directory
⏳ Uploading batch: ['BedBathBeyond_US', 'Bloomingdales_US']
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/BedBathBeyond/amp_klarna_bedbathbeyond_us.tsv.gz: failure: open 

Socket exception: An existing connection was forcibly closed by the remote host (10054)


❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/BedBathBeyond/amp_klarna_bedbathbeyond_us.tsv.gz: 
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/Bloomingdales/amp_klarna_bloomingdales_us.tsv.gz: failure: open no such file or directory
⏳ Uploading batch: ['HarryDavid_US', 'NewBalance_US', 'TheHomeDepot_US', 'TommyBahama_US']
✅ Uploaded: amp_klarna_harrydavid_us.tsv.gz
✅ Uploaded: amp_klarna_newbalance_us.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TheHomeDepot/amp_klarna_thehomedepot_us.tsv.gz: failure: open no such file or directory
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping_Ads/Klarna/TommyBahama/amp_klarna_tommybahama_us.tsv.gz: failure: open no such file or directory
⏳ Uploading batch: ['Ulta_US', 'Verizon_US', 'Zappos_US', 'UnderArmour_US']
✅ Uploaded: amp_klarna_ulta_us.tsv.gz
✅ Uploaded: amp_klarna_verizon_us.tsv.gz
✅ Uploaded: amp_klarna_zappos_us.tsv.gz
✅ Uploaded: amp_klarna_underarm

KeyboardInterrupt: 

In [5]:
%%time
import os
import time
import glob
import paramiko

advertisers = {
    "LookFantastic_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK/",
    "LookFantastic_FR": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR/",
    "LookFantastic_IT": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT/",
    "Sephora_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Sephora/",
    "MyProtein_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK/",
    "Vodafone_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone/",
    "BedBathBeyond_US": "/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/",
    "Bloomingdales_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Bloomingdales/",
    "HarryDavid_US": "/Volumes/T9/AMP/KlarnaShoppingAds/HarryDavid/",
    "Houzz_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Houzz/",
    "NewBalance_US": "/Volumes/T9/AMP/KlarnaShoppingAds/NewBalance/",
    "TheHomeDepot_US": "/Volumes/T9/AMP/KlarnaShoppingAds/TheHomeDepot/",
    "TommyBahama_US": "/Volumes/T9/AMP/KlarnaShoppingAds/TommyBahama/",
    "Ulta_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Ulta/",
    "Verizon_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Verizon/",
    "Wayfair_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Wayfair/",
    "Zappos_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Zappos",
    "UnderArmour_US": "/Volumes/T9/AMP/KlarnaShoppingAds/UnderArmour/"
}

sftp_host = "dev-sftp.admarketplace.net"
sftp_port = 22
sftp_username = "l_klarnapricerun"
sftp_password = "9ir5nukn2JGEPDC5AZsiett4"
sftp_target_folder = "/files"

def upload_to_sftp(local_file_path, remote_file_path):
    transport = None
    try:
        transport = paramiko.Transport((sftp_host, sftp_port))
        transport.connect(username=sftp_username, password=sftp_password)
        with paramiko.SFTPClient.from_transport(transport) as sftp:
            sftp.put(local_file_path.replace("\\", "/"), remote_file_path)
            print(f"✅ Uploaded: {os.path.basename(local_file_path)}")
    except Exception as e:
        print(f"❌ Failed to upload {local_file_path}: {e}")
    finally:
        if transport and transport.is_active():
            transport.close()

upload_tasks = []
for advertiser, folder_path in advertisers.items():
    expected_filename = f"amp_klarna_{advertiser.lower()}.tsv.gz"
    file_path = os.path.join(folder_path, expected_filename).replace("\\", "/")
    if os.path.exists(file_path):
        remote_path = os.path.join(sftp_target_folder, os.path.basename(file_path)).replace("\\", "/")
        upload_tasks.append((advertiser, file_path, remote_path))
    else:
        print(f"⚠️ Skipping {advertiser} — file not found: {file_path}")


for tasks in upload_tasks:
    print(f"upload, {tasks[0]}, {tasks[1]},{tasks[2]}")
    upload_to_sftp(tasks[1], tasks[2])


# for batch_size in range(2, len(upload_tasks) + 1):
#     print(f"\n🚀 Starting test with batch size: {batch_size}")
#     for i in range(0, len(upload_tasks), batch_size):
#         batch = upload_tasks[i:i + batch_size]
#         print(f"⏳ Uploading batch: {[adv for adv, _, _ in batch]}")

#         for adv, local_file, remote_file in batch:
#             upload_to_sftp(local_file, remote_file)

#         # time.sleep(1)  # Delay between batches
#     print(f"✅ Finished round for batch size {batch_size}")

print("\n🎉 All batch test uploads complete.")


upload, LookFantastic_UK, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Look Fantastic/UK/amp_klarna_lookfantastic_uk.tsv.gz,/files/amp_klarna_lookfantastic_uk.tsv.gz
✅ Uploaded: amp_klarna_lookfantastic_uk.tsv.gz
upload, LookFantastic_FR, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Look Fantastic/FR/amp_klarna_lookfantastic_fr.tsv.gz,/files/amp_klarna_lookfantastic_fr.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Look Fantastic/FR/amp_klarna_lookfantastic_fr.tsv.gz: failure: open no such file or directory
upload, LookFantastic_IT, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Look Fantastic/IT/amp_klarna_lookfantastic_it.tsv.gz,/files/amp_klarna_lookfantastic_it.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Look Fantastic/IT/amp_klarna_lookfantastic_it.tsv.gz: failure: open no such file or directory
upload, Sephora_UK, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Sephora/amp_klarna_sephora_uk.tsv.gz,/

Socket exception: An existing connection was forcibly closed by the remote host (10054)


❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Wayfair/amp_klarna_wayfair_us.tsv.gz: 
upload, Zappos_US, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Zappos/amp_klarna_zappos_us.tsv.gz,/files/amp_klarna_zappos_us.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Zappos/amp_klarna_zappos_us.tsv.gz: failure: open no such file or directory
upload, UnderArmour_US, C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/UnderArmour/amp_klarna_underarmour_us.tsv.gz,/files/amp_klarna_underarmour_us.tsv.gz
❌ Failed to upload C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/UnderArmour/amp_klarna_underarmour_us.tsv.gz: failure: open no such file or directory

🎉 All batch test uploads complete.
CPU times: total: 40.8 s
Wall time: 3min 50s


In [None]:
%%time
import os
import paramiko
import glob
from pathlib import Path

# Define advertiser-specific settings
advertisers = {
    "LookFantastic_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/UK/",
    "LookFantastic_FR": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/FR/",
    "LookFantastic_IT": "/Volumes/T9/AMP/KlarnaShoppingAds/Look Fantastic/IT/",
    "Sephora_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Sephora/",
    "MyProtein_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/My Protein/UK/",
    "Vodafone_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone/",
    "BedBathBeyond_US": "/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/",
    "Bloomingdales_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Bloomingdales/",
    "HarryDavid_US": "/Volumes/T9/AMP/KlarnaShoppingAds/HarryDavid/",
    "Houzz_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Houzz/",
    "NewBalance_US": "/Volumes/T9/AMP/KlarnaShoppingAds/NewBalance/",
    "TheHomeDepot_US": "/Volumes/T9/AMP/KlarnaShoppingAds/TheHomeDepot/",
    "TommyBahama_US": "/Volumes/T9/AMP/KlarnaShoppingAds/TommyBahama/",
    "Ulta_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Ulta/",
    "Verizon_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Verizon/",
    "Wayfair_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Wayfair/",
    "Zappos_US": "/Volumes/T9/AMP/KlarnaShoppingAds/Zappos/",  # ✅ Fixed
    "UnderArmour_US": "/Volumes/T9/AMP/KlarnaShoppingAds/UnderArmour/",
    "Nike_UK": "/Volumes/T9/AMP/KlarnaShoppingAds/Nike/UK/"
}

# SFTP Configuration
sftp_host = "ftp.admarketplace.net"
sftp_port = 8022
sftp_username = "l_klarnapricerun"
sftp_password = "9ir5nukn2JGEPDC5AZsiett4"
sftp_target_folder = "/files"

# Upload files to SFTP
def upload_to_sftp(local_file_path, remote_file_path):
    try:
        transport = paramiko.Transport((sftp_host, sftp_port))
        transport.connect(username=sftp_username, password=sftp_password)
        
        with paramiko.SFTPClient.from_transport(transport) as sftp:
            sftp.put(local_file_path, remote_file_path)
            print(f"✅ Uploaded: {local_file_path} -> {remote_file_path}")
        
        transport.close()
    except Exception as e:
        print(f"❌ Failed to upload {local_file_path}: {e}")

# Upload all processed files
for advertiser, folder in advertisers.items():
    folder_path = Path(folder)
    file_pattern = str(folder_path / f"amp_klarna_{advertiser.lower()}.tsv.gz")
    
    print(f"🔍 Searching for: {file_pattern}")
    matching_files = glob.glob(file_pattern)

    if not matching_files:
        print(f"⚠️ Skipping {advertiser}: No matching files found.")
        continue

    for local_file in matching_files:
        if not os.path.isfile(local_file):
            print(f"⚠️ File not found: {local_file}")
            continue

        remote_file_path = f"{sftp_target_folder}/{Path(local_file).name}"
        upload_to_sftp(local_file, remote_file_path)

print("🚀 All processed files uploaded (or skipped if not found).")


🔍 Searching for: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\UK\amp_klarna_lookfantastic_uk.tsv.gz
✅ Uploaded: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\UK\amp_klarna_lookfantastic_uk.tsv.gz -> /files/amp_klarna_lookfantastic_uk.tsv.gz
🔍 Searching for: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\FR\amp_klarna_lookfantastic_fr.tsv.gz
✅ Uploaded: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\FR\amp_klarna_lookfantastic_fr.tsv.gz -> /files/amp_klarna_lookfantastic_fr.tsv.gz
🔍 Searching for: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\IT\amp_klarna_lookfantastic_it.tsv.gz
✅ Uploaded: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Look Fantastic\IT\amp_klarna_lookfantastic_it.tsv.gz -> /files/amp_klarna_lookfantastic_it.tsv.gz
🔍 Searching for: C:\Users\ywang\Documents\Codes\Shopping Ads\Klarna\Sephora\amp_klarna_sephora_uk.tsv.gz
✅ Uploaded: C:\Users\ywang\Documents\Codes\Shoppi

In [23]:
import pandas as pd

# Define input and output file paths
input_gz_file = "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone/amp_klarna_vodafone.tsv.gz"  # Replace with actual file path
output_tsv_file = "/Volumes/T9/AMP/KlarnaShoppingAds/Vodafone/output_sample.tsv"

# Load the first 30 rows from the compressed TSV file with error handling
df_sample = pd.read_csv(input_gz_file, sep='\t', low_memory=False, dtype=str, nrows=30, quoting=3, compression='gzip', on_bad_lines="skip")

# Save the sample as a new TSV file
df_sample.to_csv(output_tsv_file, sep='\t', index=False)

print(f"Sample file saved as: {output_tsv_file}")


Sample file saved as: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Vodafone/output_sample.tsv


In [28]:
# Path to your original .tsv.gz file
file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Wayfair/amp_klarna_wayfair.tsv.gz'

# Path to save the extracted 30-row TSV file
output_file_path = file_path.replace('.tsv.gz', '_first_30_rows.tsv')

# Read the first 30 rows of the .tsv.gz file
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    df = pd.read_csv(file, sep='\t', nrows=30)

# Save the first 30 rows as a new TSV file
df.to_csv(output_file_path, sep='\t', index=False)

print(f"File with the first 30 rows has been saved as {output_file_path}")

File with the first 30 rows has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Wayfair/amp_klarna_wayfair_first_30_rows.tsv


## Functions for Each Adv

In [17]:
import os
import requests
import pandas as pd
import csv
import urllib.parse
from datetime import datetime

# Dictionary of advertisers with corrected URLs
advertisers = {
    "Sephora": {
        "folder": "/Volumes/T9/AMP/KlarnaShoppingAds/Sephora",
        "url": "http://files-as.intelligentreach.com/feedexports/1a627511-1fcb-4ea2-885c-b849e10a8688/Feel_Unique_UK_Admarketplace.tsv"
    }
}

# Columns to remove
redundant_columns = {
    "c:GA_product_id", "c:nov_score", "c:profit_margin_flag", "c:returns_margin_flag", "c:feed_market",
    "c:allocation", "c:stock_level_flag", "c:IM_product_name", "c:IM_beauty_brand", "c:IM_product_category",
    "c:IM_range", "c:flag_plus_size", "c:flag_maternity", "c:flag_maternity", "product_width", "product_length",
    "product_height", "shipping_weight", "shipping_length", "unit_pricing_measure", "unit_pricing_base_measure",
    "product_review_average", "c:Main_Highlights", "c:Shortened_Name_Ads", "c:Alternative Image URL (1)",
    "c:display ads link", "display ads title", "adwords_labels", "adwords_grouping", "c:Was Price (inc VAT)",
    "c:ultimate_price", "promotion_id", "subscription_cost", "instalment", "custom_label_0", "custom_label_1",
    "custom_label_2", "custom_label_3", "custom_label_4", "custom_number_0", "custom_number_1", "custom_number_2",
    "custom_number_3", "custom_number_4", "c:shipping(country:price:min_handling_time:max_han)", "shipping_label",
    "shopping_ads_excluded_country", "excluded_destination", "return_policy_label", "max_handling_time",
    "min_handling_time", "isbn", "identifier exists"
}

# Generate timestamp for unique filenames
current_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# Process each advertiser
for advertiser, data in advertisers.items():
    folder_path = data["folder"]
    url = data["url"]

    # Ensure folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Determine file extension from URL
    if url.endswith(".csv"):
        extension = ".csv"
        delimiter = ","  # CSV files use commas
    elif url.endswith(".tsv"):
        extension = ".tsv"
        delimiter = "\t"  # TSV files use tabs
    else:
        extension = ".txt"  # Default to .txt if unknown
        delimiter = " "  # Assume space-separated if structured

    # Generate input and output file names
    input_file_name = f'feed_{current_timestamp}{extension}'
    output_file_name = f'{advertiser.lower()}.tsv.gz'  # Save output as .tsv.gz

    # Full file paths
    input_file_path = os.path.join(folder_path, input_file_name)
    output_file_path = os.path.join(folder_path, output_file_name)

    # Download the file
    try:
        response = requests.get(url)
        response.raise_for_status()

        # Save the raw downloaded content
        with open(input_file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded file for {advertiser} saved as: {input_file_path}")

        # Convert content to TSV format
        df = pd.read_csv(input_file_path, sep=delimiter, low_memory=False, dtype=str, on_bad_lines="skip")
        
        # Remove redundant columns
        df = df.drop(columns=[col for col in redundant_columns if col in df.columns], errors='ignore')
        
        # Save the processed file as a compressed TSV
        df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

        print(f"Processed and saved: {output_file_path}")

    except requests.HTTPError as e:
        print(f"HTTP error occurred while downloading {advertiser}: {e}")
    except UnicodeDecodeError:
        print(f"Error: Could not decode file for {advertiser}. Please check encoding.")
    except Exception as e:
        print(f"Unexpected error while processing {advertiser}: {e}")

print("All downloads complete.")


Downloaded file for Sephora saved as: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Sephora\feed_20250310123041.tsv
Processed and saved: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Sephora\sephora.tsv.gz
All downloads complete.


## Functions for Each Adv

### BedBathBeyond

In [3]:
%%time
import gzip
import csv

def unzip_gz_to_csv(gz_file_path, output_csv_file_path):
    # Open the gz file in text mode with UTF-8 encoding
    with gzip.open(gz_file_path, 'rt', encoding='utf-8') as gz_file:
        # Open the output CSV file in write mode
        with open(output_csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            reader = csv.reader(gz_file, delimiter=',')  # Assume it's comma-separated
            writer = csv.writer(csv_file, delimiter=',')  # Writing CSV format

            for row in reader:
                # Write each row to the CSV file
                writer.writerow(row)

# Example usage
gz_file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/BedBathBeyond_PLA.csv.gz'
output_csv_file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/BBB_admarketplace.csv'

unzip_gz_to_csv(gz_file_path, output_csv_file_path)

CPU times: total: 3min 42s
Wall time: 3min 49s


In [4]:
%%time
import pandas as pd
import csv

# Path to your original CSV file
csv_file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/BBB_admarketplace.csv'

# Output TSV file path
tsv_file_path = csv_file_path.replace('.csv', '.tsv')

# Function to remove double quotes from all values in the dataframe
def remove_double_quotes(chunk):
    return chunk.apply(lambda col: col.map(lambda x: x.replace('"', '') if isinstance(x, str) else x))

# Step 1: Convert CSV to TSV
chunksize = 10000  # Process 10,000 rows at a time

with pd.read_csv(csv_file_path, sep=',', quotechar='"', quoting=csv.QUOTE_ALL, low_memory=False, dtype=str, chunksize=chunksize) as reader:
    for i, chunk in enumerate(reader):
        # Step 2: Remove double quotes from the chunk
        chunk = remove_double_quotes(chunk)

        # Step 3: Write the chunk to the TSV file
        chunk.to_csv(tsv_file_path, sep='\t', index=False, mode='w' if i == 0 else 'a', 
                     header=(i == 0), quoting=csv.QUOTE_NONE, escapechar='\\')

# Step 4: Confirm that the file has been saved
print(f"CSV has been converted to TSV, and double quotes have been removed. Cleaned file saved at: {tsv_file_path}")

CSV has been converted to TSV, and double quotes have been removed. Cleaned file saved at: C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/BedBathBeyond/BBB_admarketplace.tsv
CPU times: total: 4min 26s
Wall time: 4min 35s


In [None]:
%%time

import pandas as pd
import urllib.parse
import os

# Path to your original TSV file
file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/BedBathBeyond/BBB_admarketplace.tsv'

# Base URL to append
base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=25116&v=1.3&source=als_tiles'

# Function to encode the link and append it to the base URL
def create_new_link(original_link):
    encoded_link = urllib.parse.quote_plus(original_link)
    return f"{base_url}&cu={encoded_link}&fbu={encoded_link}"

# Output file path with .tsv.gz
output_file_path = os.path.join(os.path.dirname(file_path), 'amp_klarna_bedbathbeyond_us.tsv.gz')

# Process the TSV file in chunks
chunksize = 10000  # Process 10,000 rows at a time
with pd.read_csv(file_path, sep='\t', low_memory=False, dtype=str, on_bad_lines='skip', chunksize=chunksize) as reader:
    for i, chunk in enumerate(reader):
        # Ensure the 'Link' column is treated as strings and fill NaN with an empty string
        if 'Link' in chunk.columns:
            chunk['Link'] = chunk['Link'].astype(str).fillna('')

            # Apply the function to create a new link
            chunk['Link'] = chunk['Link'].apply(create_new_link)

        # Append the processed chunk to the output file with gzip compression
        if i == 0:
            # Write the header for the first chunk
            chunk.to_csv(output_file_path, sep='\t', index=False, mode='w', compression='gzip')
        else:
            # Append subsequent chunks without writing the header
            chunk.to_csv(output_file_path, sep='\t', index=False, mode='a', header=False, compression='gzip')

print(f"File with updated links has been saved as {output_file_path}")



File with updated links has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/BedBathBeyond\amp_klarna_bedbathbeyond_us.tsv.gz
CPU times: total: 8min 49s
Wall time: 9min 3s


### Zappos

In [35]:
%%time
import pandas as pd
import gzip

# Path to your .csv.gz file
file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Zappos/Zappos_PLA.txt.gz'

# Path to save the extracted TSV file (with a .tsv extension)
output_file_path = file_path.replace('.txt.gz', '.tsv')

# Read the .gz file with error handling and specify encoding
with gzip.open(file_path, 'rt', encoding='utf-8', errors='replace') as file:
    df = pd.read_csv(file, sep='\t', on_bad_lines='skip')

# Save the dataframe as a TSV file (without index)
df.to_csv(output_file_path, sep='\t', index=False)

print(f"File has been saved as {output_file_path}")


File has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Zappos/Zappos_PLA.tsv
CPU times: total: 1min 27s
Wall time: 1min 27s


In [36]:
%%time

import pandas as pd
import urllib.parse

# Path to your original TSV file
file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Zappos/zapoos_adsmarketplace.tsv'

# Read the TSV file
df = pd.read_csv(file_path, sep='\t', low_memory=False, dtype=str)

# Ensure the 'link' column is treated as strings and fill NaN with an empty string
df['link'] = df['link'].astype(str).fillna('')

# Remove anything after '?' in the 'link' column
df['link'] = df['link'].apply(lambda x: x.split('?')[0])

# Base URL to append
base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=1377&v=1.3&source=als_tiles'

# Function to encode the link and append it to the base URL
def create_new_link(original_link):
    encoded_link = urllib.parse.quote_plus(original_link)
    new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
    return new_link

# Apply the function to the 'link' column
df['link'] = df['link'].apply(create_new_link)

# Column renaming based on the required mapping
column_mapping = {
    'id': 'SKU/id',
    'title': 'Name',
    'description': 'Description',
    'google_product_category': 'Category',
    'link': 'URL',
    'image_link': 'Image URL',
    'condition': 'Condition',
    'availability': 'Stock status',
    'price': 'Price',
    'brand': 'Manufacturer',
    'gtin': 'EAN/GTIN',  # Ensuring GTIN remains a string
    'mpn': 'Manufacturer SKU / MPN',
    'gender': 'Gender',
    'age_group': 'AgeGroup',
    'color': 'Color',
    'size': 'Size',
    'item_group_id': 'GroupId',
    'material': 'Material',
    'pattern': 'Pattern',
    'shipping': 'Shipping costs'  # Adjust if this column represents shipping costs
}

# Rename columns
df.rename(columns=column_mapping, inplace=True)

# Ensure that 'EAN/GTIN' is treated as a string and remove any '.0' from GTIN values
df['SKU/id'] = df['SKU/id'].astype(str).apply(lambda x: x.rstrip('.0') if '.0' in x else x)
df['EAN/GTIN'] = df['EAN/GTIN'].astype(str).apply(lambda x: x.rstrip('.0') if '.0' in x else x)

# Step to handle numeric columns that show decimal
# Identify columns that can be safely converted to integers, excluding 'EAN/GTIN'
numeric_cols = df.columns[df.apply(lambda col: col.str.isnumeric(), axis=0).all()]
numeric_cols = numeric_cols.drop('EAN/GTIN', errors='ignore')  # Exclude 'EAN/GTIN'

# Convert those columns to integers explicitly
df[numeric_cols] = df[numeric_cols].apply(lambda x: pd.to_numeric(x, errors='coerce').fillna(0).astype(int))

# List of missing columns based on the requirements
missing_columns = ['AdultContent', 'Delivery time', 'Bundled', 'EnergyEfficiencyClass', 'Multipack', 'SizeSystem']

# Add missing columns with empty values or default values
for col in missing_columns:
    df[col] = ''  # Set as empty or default as needed

# Save the updated dataframe with renamed columns and new field)

output_file_path = os.path.join(os.path.dirname(file_path), 'amp_klarna_zappos_us.tsv.gz')
df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

print(f"File with updated links, renamed columns, and added missing columns has been saved as {output_file_path}")

File with updated links, renamed columns, and added missing columns has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Zappos\amp_klarna_zappos_us.tsv.gz
CPU times: total: 2min 30s
Wall time: 2min 30s


Under Armour

In [14]:
%%time
import paramiko
import os
import pandas as pd
import urllib.parse
from datetime import datetime

# Path to your original CSV file
local_file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/UnderArmour/UnderArmour_PLA.csv'

# Proceed with the rest of the code to process the file
df = pd.read_csv(local_file_path, sep=',', low_memory=False, dtype=str)

# Ensure the 'Link' column is treated as strings and fill NaN with an empty string
df['Link'] = df['Link'].astype(str).fillna('')

# Base URL to append
base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=1145&v=1.3&source=als_tiles'

# Function to encode the link and append it to the base URL
def create_new_link(original_link):
    encoded_link = urllib.parse.quote_plus(original_link)
    new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
    return new_link

# Apply the function to the 'Link' column
df['Link'] = df['Link'].apply(create_new_link)

# Column renaming based on the required mapping
column_mapping = {
    'ID': 'SKU/id',
    'Title': 'Name',
    'Description': 'Description',
    'Link': 'URL',
    'Image Link': 'Image URL',
    'Condition': 'Condition',
    'Availability': 'Stock status',
    'Price': 'Price',
    'Brand': 'Manufacturer',
    'GTIN': 'EAN/GTIN',
    'MPN': 'Manufacturer SKU / MPN',
    'Gender': 'Gender',
    'Age Group': 'AgeGroup',
    'Color': 'Color',
    'Size': 'Size',
    'Google Product Category': 'Category',
    'Sale Price': 'Sale Price',
    'Sale Price Effective Date': 'Sale Price Effective Date',
    'Expiration Date': 'Expiration Date',
    'Mobile Link': 'Mobile Link'
}

# Rename columns based on the mapping
df.rename(columns=column_mapping, inplace=True)

# List of columns to convert to integers to remove '.0'
columns_to_convert = ['SKU/id', 'EAN/GTIN']

# Function to remove '.0' by converting to integer where possible
def remove_decimal(value):
    try:
        value_float = float(value)
        if value_float.is_integer():
            return str(int(value_float))
        return value
    except (ValueError, TypeError):
        return value

# Apply the function to the specified columns
for col in columns_to_convert:
    if col in df.columns:
        df[col] = df[col].apply(remove_decimal)

# Identify other numeric columns that can be safely converted to integers
numeric_cols = df.select_dtypes(include=['object']).columns.difference(columns_to_convert)
numeric_cols = numeric_cols[df[numeric_cols].apply(lambda col: col.str.isnumeric().all())]

# Convert those columns to integers explicitly
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(0).astype(int))

# List of missing columns based on the requirements
missing_columns = ['AdultContent', 'Delivery time', 'Bundled', 'EnergyEfficiencyClass', 'Multipack', 'SizeSystem']

# Add missing columns with empty values or default values
for col in missing_columns:
    df[col] = ''  # Set as empty or default as needed

# Save the final output as a compressed TSV.GZ file
output_file_path = os.path.join(os.path.dirname(local_file_path), 'amp_klarna_underarmour_us.tsv.gz')
df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

print(f"File with updated links, renamed columns, and added missing columns has been saved as {output_file_path}")



File with updated links, renamed columns, and added missing columns has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/UnderArmour\amp_klarna_underarmour_us.tsv.gz
CPU times: total: 5.61 s
Wall time: 5.78 s


### Bloomingdales

In [15]:
%%time
import paramiko
import os
import pandas as pd
import urllib.parse
from datetime import datetime


# Path to your original TSV file
local_file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Bloomingdales/Bloomingdales_PLA.csv'

# Proceed with the rest of the code to process the file
df = pd.read_csv(local_file_path, sep=',', low_memory=False, dtype=str)

# Ensure the 'Link' column is treated as strings and fill NaN with an empty string
df['Link'] = df['Link'].astype(str).fillna('')

# Base URL to append
base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=1017&v=1.3&source=als_tiles'

# Function to encode the link and append it to the base URL
def create_new_link(original_link):
    encoded_link = urllib.parse.quote_plus(original_link)
    new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
    return new_link

# Apply the function to the 'Link' column
df['Link'] = df['Link'].apply(create_new_link)

# Column renaming based on the required mapping
column_mapping = {
    'ID': 'SKU/id',
    'Title': 'Name',
    'Description': 'Description',
    'Link': 'URL',
    'Image Link': 'Image URL',
    'Condition': 'Condition',
    'Availability': 'Stock status',
    'Price': 'Price',
    'Brand': 'Manufacturer',
    'GTIN': 'EAN/GTIN',
    'MPN': 'Manufacturer SKU / MPN',
    'Gender': 'Gender',
    'Age Group': 'AgeGroup',
    'Color': 'Color',
    'Size': 'Size',
    'Google Product Category': 'Category',
    'Sale Price': 'Sale Price',
    'Sale Price Effective Date': 'Sale Price Effective Date',
    'Expiration Date': 'Expiration Date',
    'Mobile Link': 'Mobile Link'
}

# Rename columns based on the mapping
df.rename(columns=column_mapping, inplace=True)

# List of columns to convert to integers to remove '.0'
columns_to_convert = ['SKU/id', 'EAN/GTIN']

# Function to remove '.0' by converting to integer where possible
def remove_decimal(value):
    try:
        value_float = float(value)
        if value_float.is_integer():
            return str(int(value_float))
        return value
    except (ValueError, TypeError):
        return value

# Apply the function to the specified columns
for col in columns_to_convert:
    if col in df.columns:
        df[col] = df[col].apply(remove_decimal)

# Identify other numeric columns that can be safely converted to integers
numeric_cols = df.select_dtypes(include=['object']).columns.difference(columns_to_convert)
numeric_cols = numeric_cols[df[numeric_cols].apply(lambda col: col.str.isnumeric().all())]

# Convert those columns to integers explicitly
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(0).astype(int))

# List of missing columns based on the requirements
missing_columns = ['AdultContent', 'Delivery time', 'Bundled', 'EnergyEfficiencyClass', 'Multipack', 'SizeSystem']

# Add missing columns with empty values or default values
for col in missing_columns:
    df[col] = ''  # Set as empty or default as needed

# Save the final output as a compressed TSV.GZ file
output_file_path = os.path.join(os.path.dirname(local_file_path), 'amp_klarna_bloomingdales_us.tsv.gz')
df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

print(f"File with updated links, renamed columns, and added missing columns has been saved as {output_file_path}")



File with updated links, renamed columns, and added missing columns has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Bloomingdales\amp_klarna_bloomingdales_us.tsv.gz
CPU times: total: 51.9 s
Wall time: 52.6 s


### Verizon

In [16]:
%%time
import paramiko
import os
import pandas as pd
import urllib.parse
from datetime import datetime

# SFTP credentials and connection details
sftp_host = 'ftp.admarketplace.net'
sftp_port = 8022  # Default port for SFTP
username = 'ywang'
password = '123456789'  # Recommend using environment variables for credentials

# Establish SFTP connection
try:
    transport = paramiko.Transport((sftp_host, sftp_port))
    transport.connect(username=username, password=password)
    sftp = paramiko.SFTPClient.from_transport(transport)

    # Navigate to the directory where the file is located
    target_directory = '/sftp/l_verizon/files/'  # Ensure this is the correct directory
    sftp.chdir(target_directory)

    # File naming pattern (assumed static here, but adjust if it varies)
    input_file_name = 'verizon_devices_admarketplace.csv'
    local_file_path = os.path.join('/Volumes/T9/AMP/KlarnaShoppingAds/Verizon', input_file_name)

    # Download the file from SFTP to your local system
    sftp.get(input_file_name, local_file_path)
    print(f"File downloaded from SFTP and saved locally as {local_file_path}")

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    if sftp:
        sftp.close()
    if transport:
        transport.close()

# Check if the file exists and is not empty
if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
    # The file exists and has data, so let's process it
    try:
        df = pd.read_csv(local_file_path, low_memory=False, dtype=str)
        df['link'] = df['link'].astype(str).fillna('')

        # Base URL to append
        base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=1149&v=1.3&source=als_tiles'

        # Function to encode the link and append it to the base URL
        def create_new_link(original_link):
            encoded_link = urllib.parse.quote_plus(original_link)
            new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
            return new_link

        # Apply the function to the 'link' column
        df['link'] = df['link'].apply(create_new_link)

        # Column renaming based on the required mapping
        column_mapping = {
            'id': 'SKU/id',
            'title': 'Name',
            'description': 'Description',
            'google_product_category': 'Category',
            'product_type': 'Product Type',
            'link': 'URL',
            'image_link': 'Image URL',
            'condition': 'Condition',
            'availability': 'Stock status',
            'price': 'Price',
            'brand': 'Manufacturer',
            'gtin': 'EAN/GTIN',
            'mpn': 'Manufacturer SKU / MPN',
            'color': 'Color',
            'size': 'Size',
            'shipping': 'Shipping costs',
            'custom_label_0': 'Custom Label 0',
            'custom_label_1': 'Custom Label 1',
            'custom_label_2': 'Custom Label 2',
            'custom_label_3': 'Custom Label 3',
            'custom_label_4': 'Custom Label 4',
            'short_title': 'Short Title',
            'gender': 'Gender',
            'age_group': 'AgeGroup',
            'installment': 'Installment',
            'availability_date': 'Availability Date'
        }

        # Rename columns
        df.rename(columns=column_mapping, inplace=True)

        # Function to remove decimals from the SKU and GTIN columns
        def remove_decimal(value):
            try:
                value_float = float(value)
                if value_float.is_integer():
                    return str(int(value_float))
                return value
            except (ValueError, TypeError):
                return value

        # Apply this function to necessary columns
        columns_to_convert = ['SKU/id', 'EAN/GTIN']
        for col in columns_to_convert:
            if col in df.columns:
                df[col] = df[col].apply(remove_decimal)

        # Output file path and saving as TSV
        output_file_path = os.path.join(os.path.dirname(local_file_path), 'amp_klarna_verizon_us.tsv.gz')
        df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

        print(f"Processed file has been saved as {output_file_path}")

    except pd.errors.EmptyDataError:
        print("Error: The CSV file is empty.")
else:
    print(f"Error: The file {local_file_path} does not exist or is empty.")




File downloaded from SFTP and saved locally as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Verizon\verizon_devices_admarketplace.csv
Processed file has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Verizon\amp_klarna_verizon_us.tsv.gz
CPU times: total: 688 ms
Wall time: 1.41 s


### New Balance

In [None]:
def process_newbalance(df):
    try:
        df = pd.read_csv(local_file_path, low_memory=False, dtype=str)
        df['link'] = df['link'].astype(str).fillna('')

        # Base URL to append
        base_url = 'https://klarnashoppingads.ampxdirect.com/?plid=9z0zxe52a9&ctaid=1335&v=1.3&source=als_tiles'

        # Function to encode the link and append it to the base URL
        def create_new_link(original_link):
            encoded_link = urllib.parse.quote_plus(original_link)
            new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
            return new_link

        # Apply the function to the 'link' column
        df['link'] = df['link'].apply(create_new_link)

        # Column renaming based on the required mapping
        column_mapping = {
            'GTIN': 'EAN/GTIN',
            'MPN': 'Manufacturer SKU / MPN',
            'ID': 'SKU/id',
            'Link': 'URL',
            'Title': 'Name',
            'Description': 'Description',
            'Image Link': 'Image URL',
            'Price': 'Price',
            'Condition': 'Condition',
            'Availability': 'Stock status',
            'Brand': 'Manufacturer',
            'Google Product Category': 'Category',
            'Top Performing Product': 'Bundled',  # Assuming relation
            'Color': 'Color',
            'Size': 'Size',
            'Gender': 'Gender',
            'Age Group': 'AgeGroup',
            'Sale Price': 'Sale Price',
            'Sale Price Effective Date': 'Sale Price Effective Date',
            'Expiration Date': 'Expiration Date',
    # Additional mappings from second part
            'SizeSystem': 'SizeSystem',
            'AdultContent': 'AdultContent',
            'Delivery time': 'Delivery time',  # Mapped directly from the second part
            'EnergyEfficiencyClass': 'EnergyEfficiencyClass',
            'GroupId': 'GroupId',
            'Material': 'Material',
            'Multipack': 'Multipack',
            'Pattern': 'Pattern'
        }


        # Rename columns
        df.rename(columns=column_mapping, inplace=True)

        # Function to remove decimals from the SKU and GTIN columns
        def remove_decimal(value):
            try:
                value_float = float(value)
                if value_float.is_integer():
                    return str(int(value_float))
                return value
            except (ValueError, TypeError):
                return value

        # Apply this function to necessary columns
        columns_to_convert = ['SKU/id', 'EAN/GTIN']
        for col in columns_to_convert:
            if col in df.columns:
                df[col] = df[col].apply(remove_decimal)

    except pd.errors.EmptyDataError:
        print("Error: The CSV file is empty.")
    except FileNotFoundError:
        print(f"Error: The file {local_file_path} does not exist or is empty.")

Ulta

In [None]:
import pandas as pd
import urllib.parse
import os

# Path to your original TSV file
file_path = '/Volumes/T9/AMP/KlarnaShoppingAds/Ulta/ulta.tsv'

# Read the TSV file
df = pd.read_csv(file_path, sep='\t', low_memory=False, dtype=str)

# Ensure the 'link' column is treated as strings and fill NaN with an empty string
df['link'] = df['link'].astype(str).fillna('')

# Base URL to append
base_url = 'https://klarnashoppingads.ampxdirect.com/?partner=klarnashoppingads&sub1=shoppingads&ctaid=74843&v=1.3&source=als_tiles'

# Function to encode the link and append it to the base URL
def create_new_link(original_link):
    encoded_link = urllib.parse.quote_plus(original_link)
    new_link = f"{base_url}&cu={encoded_link}&fbu={encoded_link}"
    return new_link

# Apply the function to the 'link' column
df['link'] = df['link'].apply(create_new_link)

# Column renaming based on the required mapping
column_mapping = {
    'id': 'SKU/id',
    'title': 'Name',
    'price': 'Price',
    'shipping': 'Shipping costs',  
    'availability': 'Stock status',  
    'availability_date': 'Delivery time',  
    'brand': 'Manufacturer',
    'gtin': 'EAN/GTIN',
    'mpn': 'Manufacturer SKU / MPN',
    'link': 'URL',
    'image_link': 'Image URL',
    'google_product_category': 'Category',
    'description': 'Description',
    'adult': 'AdultContent',
    'age_group': 'AgeGroup',
    'color': 'Color',
    'condition': 'Condition',
    'item_group_id': 'GroupId',
    'material': 'Material',
    'pattern': 'Pattern',
    'size': 'Size',
    'size_system': 'SizeSystem',
}

# Rename columns
df.rename(columns=column_mapping, inplace=True)

# Ensure that 'EAN/GTIN' is treated as a string and remove any '.0' from GTIN values
df['EAN/GTIN'] = df['EAN/GTIN'].astype(str).apply(lambda x: x.rstrip('.0') if '.0' in x else x)

# Convert numeric columns to integers
numeric_cols = df.columns[df.apply(lambda col: col.str.isnumeric(), axis=0).all()]
numeric_cols = numeric_cols.drop('EAN/GTIN', errors='ignore')  

# Convert numeric columns to integers explicitly
df[numeric_cols] = df[numeric_cols].apply(lambda x: pd.to_numeric(x, errors='coerce').fillna(0).astype(int))

# List of missing columns
missing_columns = ['AdultContent', 'Delivery time', 'Bundled', 'EnergyEfficiencyClass', 'Multipack', 'SizeSystem']

# Add missing columns with default values
for col in missing_columns:
    df[col] = ''  # Set as empty or default as needed

# Set the output file path with `.tsv.gz`
output_file_path = os.path.join(os.path.dirname(file_path), 'amp_klarna_ulta_us.tsv.gz')

# Save the DataFrame as a compressed TSV (.tsv.gz)
df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

print(f"File with updated links, renamed columns, and added missing columns has been saved as {output_file_path}")

File with updated links, renamed columns, and added missing columns has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Ulta\amp_klarna_ulta.tsv.gz


## Proceed with File and Save

In [15]:
%%time
import pandas as pd
import urllib.parse
import os

# Define the processing function for each file
def process_file(advertiser, local_file_path, output_file_path):
    # Read the CSV file (adjust reading logic per advertiser's format if needed)
    df = pd.read_csv(local_file_path, low_memory=False, dtype=str)

    # Processing logic specific to each advertiser
    if advertiser == 'Bloomingdales':
        df = process_bloomingdales(df)
        
    elif advertiser == 'Verizon':
        df = process_verizon(df)
    elif advertiser == 'NewBalance':
        df = process_newbalance(df)
    #     

    # # Add other advertiser-specific cases here (e.g., Tommy Bahama, HomeDepot, etc.)
    
    # Save the processed file as TSV (with gzip compression)
    df.to_csv(output_file_path, sep='\t', index=False, compression='gzip')
    print(f"{advertiser}: Processed file has been saved as {output_file_path}")


# Step 2: Process each file individually after downloading
for advertiser, paths in advertisers.items():
    local_folder = paths['local_path']
    
    # Generate input file name (static or dynamic)
    input_file_name = paths['file_pattern'](current_date) if callable(paths['file_pattern']) else paths['file_pattern']
    local_file_path = os.path.join(local_folder, input_file_name)
    
    # Output file for processed data
    output_file_path = os.path.join(local_folder, f'amp_klarna_{advertiser}.tsv.gz')
    
    # Process the file with advertiser-specific logic
    process_file(advertiser, local_file_path, output_file_path)




Bloomingdales: Processed file has been saved as C:/Users/ywang/Documents/Codes/Shopping Ads/Klarna/Bloomingdales/amp_klarna_Bloomingdales.tsv.gz


AttributeError: 'NoneType' object has no attribute 'to_csv'