## preprocess.ipynb
これはcsvファイルの前処理用のNotebookです

In [1]:
import sys
sys.path.append("../../")
sys.path.append("../../../")

In [None]:
import os
from PIL import Image
from torchvision import transforms
import pandas as pd
import torch
import timm
import pickle
from torch import nn

In [3]:
df = pd.read_csv("./watch_img.csv")

In [None]:
df.head()

In [None]:
df["img_path"]

## csvの画像以外の前処理

In [8]:
def preprocessing(input_df, rupee_to_yen=110):
    """Clean the non-image columns of the product CSV.

    Steps, applied in order:
      1. ``actual_price``: drop missing rows, strip the "₹" sign and thousands
         separators, convert to float, and add ``actual_price_yen``
         (``actual_price * rupee_to_yen``).
      2. ``no_of_ratings``: drop missing rows, remove thousands separators and
         convert to int; values that are not numeric become 0.
      3. ``img_path``: drop rows with a missing image path.

    Args:
        input_df: DataFrame containing the ``actual_price``, ``no_of_ratings``
            and ``img_path`` columns. It is not modified.
        rupee_to_yen: Multiplier used to derive ``actual_price_yen``.
            Defaults to 110, the value that was previously hard-coded.

    Returns:
        A new DataFrame with the cleaned columns and ``actual_price_yen``.

    Raises:
        ValueError: If a non-missing ``actual_price`` cannot be parsed as a
            float after the currency sign and commas are removed.
    """
    def price2yen(input_df):
        # dropna(...).copy() gives an independent frame, so the column
        # assignments below never write into a view of the caller's data.
        output = input_df.dropna(subset=["actual_price"]).copy()
        # astype(str) keeps the .str accessor usable even when pandas already
        # parsed the column as numeric (it raises AttributeError otherwise).
        price_text = (
            output["actual_price"]
            .astype(str)
            .str.strip("₹")
            .str.replace(",", "", regex=False)
        )
        output["actual_price"] = price_text.astype(float)
        output["actual_price_yen"] = output["actual_price"] * rupee_to_yen
        return output

    def no_of_rate(input_df):
        output = input_df.dropna(subset=["no_of_ratings"]).copy()
        ratings_text = (
            output["no_of_ratings"].astype(str).str.replace(",", "", regex=False)
        )
        # The column may contain free text; coerce those entries to NaN and
        # count them as 0 ratings instead of raising.
        output["no_of_ratings"] = (
            pd.to_numeric(ratings_text, errors="coerce").fillna(0).astype(int)
        )
        return output

    def img_path(input_df):
        return input_df.dropna(subset=["img_path"]).copy()

    output = price2yen(input_df)
    output = no_of_rate(output)
    output = img_path(output)
    return output

In [9]:
output_df = preprocessing(df)

In [None]:
len(output_df)

In [None]:
output_df.head()

In [None]:
output_df["img_path"].isnull().sum()

In [14]:
output_df.to_csv("./watch_preprocess.csv",index = False)

## 画像部分のembedding作成

In [56]:
# NOTE(review): the preprocessing section above writes "./watch_preprocess.csv",
# but this cell loads "./All Appliances_preprocess.csv" — confirm that this is
# the intended input for the embedding step.
output_df = pd.read_csv("./All Appliances_preprocess.csv")

In [None]:
output_df.head()

In [None]:
# Image preprocessing: resize, center-crop to 224x224, convert to a tensor and
# normalize per channel (these mean/std values are the commonly used ImageNet
# statistics).
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image_folder = '/root/graduation_thetis/causal-bert-pytorch/input/images'
embedding_folder = '/root/graduation_thetis/causal-bert-pytorch/input/embeddings'
# Create the output directory up front so the first pickle write cannot fail
# with FileNotFoundError.
os.makedirs(embedding_folder, exist_ok=True)

# num_classes=0 is passed so the model output itself is used as the embedding
# (per timm's feature-extraction docs this drops the classifier head).
model = timm.create_model('resnet50', pretrained=True, num_classes=0)
model.eval()

# Choose the device once and move the model a single time, instead of calling
# model.cuda() on every loop iteration.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for i, row in output_df.iterrows():
    img_path = row["img_path"]
    # Missing values loaded from CSV are NaN, not None, so the previous
    # `!= None` check never skipped them; pd.isna covers both.
    if pd.isna(img_path):
        continue

    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert("RGB")
    img_tensor = preprocess(img).unsqueeze(0).to(device)

    # Generate the embedding without tracking gradients.
    with torch.no_grad():
        embedding = model(img_tensor)

    # Convert the embedding to numpy and save it as a pickle file named after
    # the row label.
    embedding_np = embedding.cpu().numpy()
    embedding_path = os.path.join(embedding_folder, f"embedding_{i}.pkl")

    with open(embedding_path, 'wb') as f:
        pickle.dump(embedding_np, f)

    # Record where the embedding was written so later cells can load it.
    output_df.at[i, 'embedding_path'] = embedding_path

    # Print for confirmation
    print(f"Saved embedding for {img_path} at {embedding_path}")

In [59]:
output_df.to_csv("preprocessed.csv",index = False)

## embeddingと合わせて交絡を作成する

In [1]:
import pickle

In [5]:
df = pd.read_csv("preprocessed.csv")

In [6]:
# Reduce an embedding vector (2048-dimensional by default) to a single value
class EmbeddingReducer(nn.Module):
    """Single linear layer that projects an embedding to a lower dimension.

    Args:
        in_features: Size of the incoming embedding. Defaults to 2048.
        out_features: Size of the projected output. Defaults to 1.
    """

    def __init__(self, in_features=2048, out_features=1):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear projection to ``x`` (last dimension ``in_features``)."""
        return self.fc(x)

In [None]:
embedding

In [None]:
import numpy as np

# Collapse every saved embedding into one scalar and store it in df["embedding"].
# assumes each pickle holds a (1, D) array as written by the embedding cell, in
# which case dot(e, e.T) is the squared L2 norm — TODO confirm.
for i, row in df.iterrows():
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open(row['embedding_path'], 'rb') as f:
        embedding = pickle.load(f)
    # The file is only needed for loading; the math happens after it is closed.
    gram = np.dot(embedding, embedding.T)
    print(gram.shape)
    # Store a plain float: writing the (1, 1) ndarray itself into a single
    # DataFrame cell is fragile. .item() raises if the result is not one value.
    scaler = float(gram.item())
    df.at[i, "embedding"] = scaler
scaler

In [None]:
df["embedding"].mean()

## 処置変数を作成する

In [None]:
output_df["actual_price_yen"].max()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the price column of output_df (no outlier filtering applied here)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=output_df["actual_price_yen"], ax=ax)
ax.set_title('Actual Price Yen Distribution - Box Plot')
ax.set_xlabel('Actual Price Yen')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Interquartile range (IQR) of the price column
Q1 = output_df["actual_price_yen"].quantile(0.25)
Q3 = output_df["actual_price_yen"].quantile(0.75)
IQR = Q3 - Q1

# Outlier thresholds: 1.5 x IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows inside [lower_bound, upper_bound] (both ends inclusive).
# .copy() makes filtered_df an independent frame: a later cell adds a column to
# it, which on a plain boolean-mask slice can raise SettingWithCopyWarning.
in_range = output_df["actual_price_yen"].between(lower_bound, upper_bound)
filtered_df = output_df[in_range].copy()

# Box plot of the filtered prices
plt.figure(figsize=(10, 6))
sns.boxplot(x=filtered_df["actual_price_yen"])
plt.title('Actual Price Yen Distribution - Box Plot (Outliers Removed)')
plt.xlabel('Actual Price Yen')
plt.show()

In [None]:
# "actual_price_yen"の平均を計算
mean_price = filtered_df["actual_price_yen"].mean()

# Add the binary "price_ave" column: 1 when the price is strictly above the
# mean, otherwise 0 (missing prices compare as False and therefore become 0).
# assign() returns a new DataFrame, so nothing is written into a slice of
# output_df, and the comparison is vectorized instead of a per-row lambda.
filtered_df = filtered_df.assign(
    price_ave=(filtered_df["actual_price_yen"] > mean_price).astype(int)
)

In [None]:
filtered_df["price_ave"].value_counts()

In [27]:
df = filtered_df.copy()

In [45]:
# Coefficient applied to the binary "price_ave" column in the outcome below
# (presumably the treatment effect being simulated — confirm).
theta = 0.5
# NOTE(review): when the notebook is run top to bottom, `df` here is a copy of
# filtered_df (derived from output_df), while the "embedding" column was only
# written to the earlier `df` loaded from "preprocessed.csv". Confirm that
# "embedding" is actually present, otherwise this line raises KeyError.
df["output"] = theta * df["price_ave"] + 0.3 * df["no_of_ratings"] + df["embedding"]

In [29]:
# index=False, matching the other to_csv calls in this notebook: otherwise the
# row index is written as an unnamed first column and comes back as
# "Unnamed: 0" when the file is read again.
df.to_csv("./watch_outputs.csv", index=False)

## outputsを2値分類する必要ある? 

In [30]:
df = pd.read_csv("./watch_outputs.csv")

In [None]:
df.head()

In [7]:
# Binarize the outcome: 1 when strictly above the column mean, otherwise 0.
mean = df["output"].mean()


def _above_mean(value):
    """Return 1 if ``value`` exceeds the mean of df["output"], else 0."""
    return 1 if value > mean else 0


df["output_2v"] = df["output"].apply(_above_mean)

In [None]:
df["output_2v"].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.hist(df["output"])

In [15]:
# index=False, matching the other to_csv calls in this notebook: `df` was just
# read back from CSV, so writing the index again would add one more unnamed
# column on every save/load round trip.
df.to_csv("outputs_v2.csv", index=False)

## 2値分類の方式を変える
画像の白っぽいor黒っぽいを交絡変数としてみる

In [32]:
df = pd.read_csv("/root/graduation_thetis/causal-bert-pytorch/input/watch_outputs.csv")

In [33]:
from PIL import Image
import numpy as np

def _mean_luminance(image_path):
    """Return the mean luminance of the image at ``image_path``.

    The image is converted to RGB and each pixel's luminance is the weighted
    sum 0.299*R + 0.587*G + 0.114*B; the result is the mean over all pixels.
    """
    # The context manager releases the file handle even if conversion fails.
    with Image.open(image_path) as img:
        img_np = np.array(img.convert('RGB'))

    luminance = 0.299 * img_np[:, :, 0] + 0.587 * img_np[:, :, 1] + 0.114 * img_np[:, :, 2]
    return np.mean(luminance)


def is_dark_or_light(image_path, threshold=160):
    """Classify an image as "dark" or "light" by its mean luminance.

    Args:
        image_path: Path of the image file to inspect.
        threshold: Mean luminance below this value counts as dark.

    Returns:
        "dark" if the mean luminance is below ``threshold``, else "light".
        The mean luminance and the verdict are also printed.
    """
    avg_brightness = _mean_luminance(image_path)

    print(f"平均輝度: {avg_brightness}")

    if avg_brightness < threshold:
        print("画像は黒っぽいです。")
        return "dark"

    print("画像は白っぽいです。")
    return "light"


def brightness(image_path, threshold=160):
    """Return (and print) the mean luminance of the image at ``image_path``.

    Args:
        image_path: Path of the image file to inspect.
        threshold: Unused; kept so the signature stays compatible with
            existing callers.

    Returns:
        The mean luminance over all pixels.
    """
    avg_brightness = _mean_luminance(image_path)

    print(f"平均輝度: {avg_brightness}")

    return avg_brightness



In [None]:
df["brightness"] = df["img_path"].apply(brightness)

In [None]:
df["light_or_dark"] = df["img_path"].apply(is_dark_or_light)

In [39]:
df["light_or_dark"] = df["light_or_dark"].apply(lambda x : 1 if x == "light" else 0)

In [None]:
df["light_or_dark"].value_counts()

In [40]:
df.to_csv("watch_train.csv",index = None, )

In [None]:
import matplotlib.pyplot as plt
plt.hist(df["brightness"])

In [None]:
df

In [None]:
df.light_or_dark.value_counts()

In [None]:
# Preview the images that were classified as dark.
# An earlier cell re-encodes "light_or_dark" to 1 (light) / 0 (dark), after
# which comparing against "dark" alone selects nothing; accept both encodings.
dark_rows = df[df["light_or_dark"].isin(["dark", 0])]
for i, row in dark_rows.iterrows():
    img_path = row["img_path"]
    print(img_path)
    # A bare Image.open(...) inside a loop renders nothing in a notebook.
    # display() is provided by the Jupyter/IPython kernel; the context manager
    # closes the file once the image has been shown.
    with Image.open(img_path) as img:
        display(img)