## preprocess.ipynb
これは商品CSVファイルの前処理と、商品画像のembedding作成を行うNotebookです。

In [3]:
import sys
# Add the parent directory to the module search path so that packages located
# one level above this notebook can be imported.
sys.path.append("../")

In [4]:
import os
from PIL import Image
from torchvision import transforms
import pandas as pd
import torch
import timm

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# Load the raw product table (path is relative to the notebook's working directory).
# NOTE(review): the preview of this frame shows "Unnamed: 0" / "Unnamed: 0.1" columns,
# which usually means the CSV was written with its index more than once — confirm
# whether those columns are needed.
df = pd.read_csv("./All Appliances_img.csv")

In [32]:
# Preview the first rows of the raw table to check the columns and value formats.
df.head()

Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,img_path
0,0,Pigeon by Stovekraft Amaze Plus Electric Kettl...,appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Pigeon-Amaze-Plus-1-5-Lt...,3.9,128941,₹599,"₹1,245",
1,1,Pigeon Polypropylene Mini Handy and Compact Ch...,appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Pigeon-Stovekraft-Plasti...,4.1,274505,₹199,₹545,
2,2,Glun Multipurpose Portable Electronic Digital ...,appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Multipurpose-Portable-El...,3.8,365,₹199,₹899,
3,3,beatXP Kitchen Scale Multipurpose Portable Ele...,appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/beatXP-Multipurpose-Port...,3.7,3290,₹299,"₹1,999",
4,4,Bajaj DX-6 1000W Dry Iron with Advance Solepla...,appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Bajaj-Majesty-1000-Watt-...,4.2,24380,₹625,"₹1,400",


In [33]:
# Inspect the image-path column; rows without a local image file hold NaN.
df["img_path"]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
9571    NaN
9572    NaN
9573    NaN
9574    NaN
9575    NaN
Name: img_path, Length: 9576, dtype: object

## csvの画像以外の前処理

In [34]:
def preprocessing(input_df, yen_per_unit=110):
    """Clean the raw product table and keep only rows usable downstream.

    The steps are applied in order:
      1. Drop rows without ``actual_price``, strip the "₹" symbol and thousands
         separators, convert the price to float, and add ``actual_price_yen``.
      2. Drop rows without ``no_of_ratings`` and convert the column to int;
         values that cannot be parsed as a number become 0.
      3. Drop rows without ``img_path``.

    The input frame is never modified.

    Args:
        input_df: DataFrame with at least the columns ``actual_price``,
            ``no_of_ratings`` and ``img_path``.
        yen_per_unit: Multiplier used to derive ``actual_price_yen`` from
            ``actual_price``. Defaults to 110, the value previously hard-coded.
            NOTE(review): the source prices carry the "₹" symbol and 110 looks
            like a USD->JPY rate rather than INR->JPY — confirm before relying
            on ``actual_price_yen``.

    Returns:
        A new DataFrame containing the cleaned rows plus ``actual_price_yen``.

    Raises:
        ValueError: If a non-missing ``actual_price`` cannot be parsed as a
            float after removing "₹" and ",".
    """
    def price2yen(frame):
        # dropna() already returns a new frame; copy() makes the ownership explicit
        # so the column assignments below never touch the caller's data.
        cleaned = frame.dropna(subset=["actual_price"]).copy()
        # Cast to str first so the .str accessor also works if pandas parsed the
        # column as numeric.
        price_text = (
            cleaned["actual_price"]
            .astype(str)
            .str.strip("₹")
            .str.replace(",", "", regex=False)
        )
        cleaned["actual_price"] = price_text.astype(float)
        cleaned["actual_price_yen"] = cleaned["actual_price"] * yen_per_unit
        return cleaned

    def no_of_rate(frame):
        cleaned = frame.dropna(subset=["no_of_ratings"]).copy()
        # The column may be numeric (no thousands separators in the file) or text;
        # casting to str avoids the AttributeError that .str raises on numeric data.
        counts_text = cleaned["no_of_ratings"].astype(str).str.replace(",", "", regex=False)
        # The column can contain free text instead of a count, so unparseable
        # values are coerced to NaN and then replaced by 0.
        cleaned["no_of_ratings"] = (
            pd.to_numeric(counts_text, errors="coerce").fillna(0).astype(int)
        )
        return cleaned

    def img_path(frame):
        # Rows without a local image cannot be embedded later, so they are removed.
        return frame.dropna(subset=["img_path"]).copy()

    output = price2yen(input_df)
    output = no_of_rate(output)
    output = img_path(output)
    return output

In [35]:
# Build the cleaned table; the raw frame `df` is left untouched because
# preprocessing() works on copies.
output_df = preprocessing(df)

In [36]:
# Number of rows that survive the cleaning steps.
len(output_df)

5916

In [39]:
# Preview the cleaned table, including the derived actual_price_yen column.
output_df.head()

Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,img_path,actual_price_yen
72,72,AmazonBasics High Speed 55 Watt Oscillating Pe...,appliances,All Appliances,https://m.media-amazon.com/images/I/71QfUcEOg8...,https://www.amazon.in/AmazonBasics-400mm-Pedes...,4.1,6113,"₹2,099",3300.0,/root/graduation_thetis/input/images/72.jpg,363000.0
73,73,Farberware Mini Blender Fruit Mixer Machine Po...,appliances,All Appliances,https://m.media-amazon.com/images/I/716mmFt0PG...,https://www.amazon.in/Farberware-Portable-Elec...,2.9,6071,₹499,1199.0,/root/graduation_thetis/input/images/73.jpg,131890.0
74,74,PHILIPS Handheld Garment Steamer STH3000/20 - ...,appliances,All Appliances,https://m.media-amazon.com/images/I/71W2XPQdBq...,https://www.amazon.in/PHILIPS-Handheld-Garment...,4.0,1553,"₹3,995",4095.0,/root/graduation_thetis/input/images/74.jpg,450450.0
75,75,"Cookwell Bullet Mixer Grinder (5 Jars, 3 Blade...",appliances,All Appliances,https://m.media-amazon.com/images/I/81yobRRV8n...,https://www.amazon.in/Cookwell-Bullet-Mixer-Gr...,4.1,9592,"₹2,479",6000.0,/root/graduation_thetis/input/images/75.jpg,660000.0
76,76,"Bajaj ATX 4 750-Watt Pop-up Toaster, 2-Slice A...",appliances,All Appliances,https://m.media-amazon.com/images/I/51D5T7TGVb...,https://www.amazon.in/Bajaj-ATX-750-Watt-Pop-u...,4.3,9520,"₹1,499",2250.0,/root/graduation_thetis/input/images/76.jpg,247500.0


In [41]:
# Sanity check: preprocessing() drops rows without img_path, so this count should be 0.
output_df["img_path"].isnull().sum()

0

In [40]:
# Save the cleaned table; index=False keeps the row index out of the CSV file.
output_df.to_csv("./All Appliances_preprocess.csv",index = False)

## 画像部分のembedding作成

In [None]:
# Preprocessing pipeline: resize, center-crop to 224x224, convert to a tensor and
# normalize with the commonly used ImageNet channel mean/std.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image_folder = '/root/graduation_thetis/input/images'

# Run the model and its inputs on the same device; moving only the inputs to the
# GPU would raise a device-mismatch error.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# num_classes=0 asks timm for the backbone without a classification head, so the
# forward pass returns feature vectors instead of class scores.
model = timm.create_model('resnet50', pretrained=True, num_classes=0)
model = model.to(device)
model.eval()

# One entry per row of df, in row order; None marks rows without an image.
embeddings = []
for img_path in df["img_path"]:
    # Missing paths are NaN, and `NaN != None` is True, so test with pd.isna().
    if pd.isna(img_path):
        embeddings.append(None)
        continue

    # The context manager closes the file handle once the pixels are converted.
    with Image.open(img_path) as img:
        img_tensor = preprocess(img.convert("RGB")).unsqueeze(0).to(device)

    # Generate the embedding without tracking gradients.
    with torch.no_grad():
        embedding = model(img_tensor)

    # Drop the batch dimension and keep a 1-D NumPy vector per image.
    embeddings.append(embedding.squeeze(0).cpu().numpy())

# Store the vectors in an object column; assigning an array to a single cell of a
# column that does not exist yet (df.at[...]) is not reliable.
df["embedding"] = pd.Series(embeddings, index=df.index, dtype=object)

embedded_count = int(df["embedding"].notna().sum())
print(f"Embedded {embedded_count} of {len(df)} rows")

## embeddingと合わせて交絡を作成する

In [None]:
# Weight applied to the price term of the combined output.
theta = 0.5
# NOTE(review): "price_ave" is not created anywhere in the visible notebook — confirm
# the column exists in df (possibly a price column such as "actual_price_yen" was meant).
# NOTE(review): only output_df receives the numeric no_of_ratings conversion; verify
# that df["no_of_ratings"] is numeric here.
# NOTE(review): the embedding column holds one NumPy array per row, so this sum mixes
# scalar terms with arrays — verify the intended way of combining them.
df["output"] = theta * df["price_ave"] + 0.3 * df["no_of_ratings"] + df["embedding"]

In [None]:
# Save the final table. index=False matches how the preprocessed CSV is written and
# prevents the row index from reappearing as an "Unnamed: 0" column when the file
# is read back.
# NOTE(review): array-valued cells (the embedding column) are written to CSV as their
# text representation, which NumPy abbreviates for long arrays — consider a binary
# format if the embeddings must be recovered exactly.
df.to_csv("./All Appliances_output.csv", index=False)