In [41]:
# Mount Google Drive at /content/drive; later cells read code and data from
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import subprocess
!pip install shap
!pip install --upgrade shap
!pip install geohash
!pip install contextily

try:
    import japanize_matplotlib
    print("japanize_matplotlib is already installed.")
except ImportError:
    !pip install japanize_matplotlib
    print("japanize_matplotlib has been installed.")

try:
    import catboost
    print("catboost is already installed.")
except ImportError:
    !pip install catboost
    print("catboost has been installed.")

try:
    import optuna
    print("optuna is already installed.")
except ImportError:
    !pip install optuna
    print("optuna has been installed.")

try:
    import geolib
    print("geolib is already installed.")
except ImportError:
    !pip install geolib
    print("geolib has been installed.")

# mkdirとechoコマンドの実行
try:
    subprocess.check_call('mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd', shell=True)
    print('Directory and file for OpenCL vendors have been created.')
except subprocess.CalledProcessError as e:
    print(f"Error running mkdir and echo command: {e}")

Collecting contextily
  Downloading contextily-1.6.2-py3-none-any.whl.metadata (2.9 kB)
Collecting mercantile (from contextily)
  Downloading mercantile-1.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting rasterio (from contextily)
  Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting affine (from rasterio->contextily)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio->contextily)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting snuggs>=1.4.1 (from rasterio->contextily)
  Downloading snuggs-1.4.7-py3-none-any.whl.metadata (3.4 kB)
Collecting click-plugins (from rasterio->contextily)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading contextily-1.6.2-py3-none-any.whl (17 kB)
Downloading mercantile-1.2.1-py3-none-any.whl (14 kB)
Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.7 M

In [43]:
import os

def list_directories(root_dir):
    """Print the full path of every directory found below ``root_dir``.

    The tree is traversed recursively with ``os.walk``; one path is printed
    per line. Nothing is returned.
    """
    for parent, child_dirs, _ in os.walk(root_dir):
        for full_path in (os.path.join(parent, name) for name in child_dirs):
            print(full_path)

# Specify the directory whose sub-directories should be listed
root_directory = '/content/drive/MyDrive/rsna2024_model'  # e.g. a directory inside Google Drive
list_directories(root_directory)


/content/drive/MyDrive/rsna2024_model/efficientnet_b0_fold_0
/content/drive/MyDrive/rsna2024_model/convnext_base_fold_0
/content/drive/MyDrive/rsna2024_model/efficientnet_b7_fold_0
/content/drive/MyDrive/rsna2024_model/convnextv2_huge_fold_0


In [None]:
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize


import sys
# Make the project's own modules (stored on Google Drive) importable.
sys.path.append('/content/drive/MyDrive/yuto/project_directory/code')

# NOTE(review): later cells use glob, check_dataframe_elements, ProcessData,
# tqdm_notebook, urllib and requests without importing them in any visible
# cell; presumably this star import supplies them — confirm against utils.py.
from utils import *

In [None]:
# Load every training CSV (first column used as the index) and stack them.
# glob returns paths in arbitrary order, so they are sorted to keep the row
# order of the concatenated frame reproducible between runs.
files = sorted(glob.glob(
    r"/content/drive/MyDrive/yuto/project_directory/data/competition_data/train/train/*.csv"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No training CSV files matched the glob pattern.")
data_list = [pd.read_csv(file, index_col=0) for file in files]
df = pd.concat(data_list)
# Keep an untouched copy of the raw training data.
df_copy=df.copy()
check_dataframe_elements(df)

df_test=pd.read_csv(r"/content/drive/MyDrive/yuto/project_directory/data/competition_data/test.csv",index_col=0)
# Keep an untouched copy of the raw test data.
df_test_copy=df_test.copy()


# Count the missing values in the nearest-station name column ('最寄駅：名称')
tre_missing_values_count = df['最寄駅：名称'].isnull().sum()
tes_missing_values_count = df_test['最寄駅：名称'].isnull().sum()
print(f"学習データの欠損値の数: {tre_missing_values_count}")
print(f"テストデータの欠損値の数: {tes_missing_values_count}")

In [None]:
# Data preprocessing & feature engineering
# NOTE(review): ProcessData is not defined in the visible cells (presumably it
# comes from utils); the exact meaning of the five returned objects cannot be
# confirmed from here — verify against its definition.
data_tree=ProcessData(df,df_test)
df_train_tree,df_test_tree,df_all_tree,df_geo,cat_cols=data_tree.return_data_test()

In [None]:
class Geocod_API:
    """Geocode the distinct addresses of a DataFrame via the GSI address search API.

    An address string is built by concatenating the "都道府県名", "市区町村名" and
    "地区名" columns (missing parts count as empty strings). Every distinct
    address is then sent once to the address-search endpoint.

    Note: ``make_unique_addresses`` adds a "住所" column to the DataFrame that
    was passed to the constructor (the frame is not copied).
    """

    # Columns concatenated, in this order, to form the address string.
    _ADDRESS_PARTS = ("都道府県名", "市区町村名", "地区名")
    # Columns of the source row that are copied into every result record.
    _CARRIED_COLUMNS = ("都道府県名", "市区町村名", "地区名", "最寄駅：名称", "最寄駅：距離（分）")
    # Address-search endpoint; the URL-encoded address is appended as `?q=`.
    _API_URL = "https://msearch.gsi.go.jp/address-search/AddressSearch"

    def __init__(self,df):
        """Store ``df``; it must contain the address and station columns used below."""
        self.df=df
        self.df_result=pd.DataFrame()
        self.df_search_results=None
        # Filled by make_unique_addresses(); None until then.
        self.unique_addresses=None

    def make_unique_addresses(self):
        """Add the "住所" column to ``self.df`` and keep one row per distinct address.

        The de-duplicated rows (first occurrence of each address, all columns
        kept) are stored in ``self.unique_addresses``.
        """
        parts = [
            self.df[column].apply(lambda x: "" if pd.isna(x) else x)
            for column in self._ADDRESS_PARTS
        ]
        self.df["住所"] = parts[0] + parts[1] + parts[2]

        # Extract the unique combinations. drop_duplicates returns a new frame,
        # so self.df itself keeps all of its rows.
        self.unique_addresses = self.df.drop_duplicates(subset=["住所"])

        print("ユニークなアドレスの数=>",len(self.unique_addresses))

    def _fetch_candidates(self, address, timeout):
        """Return the decoded JSON candidates for ``address``; [] if the request fails.

        A network error, HTTP error status or undecodable body is reported with
        ``print`` and treated like "no result", so that one bad request does not
        throw away the results already collected by the calling loop.
        """
        # URL-encode the address
        encoded_address = urllib.parse.quote(address)
        url = f"{self._API_URL}?q={encoded_address}"
        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as exc:
            print(f"Request failed for address {address!r}: {exc}")
            return []

    def _make_record(self, row, address, candidate):
        """Build one result record; ``candidate=None`` yields empty title/x/y."""
        record = {column: row[column] for column in self._CARRIED_COLUMNS}
        record['address'] = address
        if candidate is None:
            record['title'] = None
            record['x'] = None
            record['y'] = None
        else:
            coordinates = candidate['geometry']['coordinates']
            record['title'] = candidate['properties']['title']
            record['x'] = coordinates[0]
            record['y'] = coordinates[1]
        return record

    def search_addresses3(self, timeout=10):
        """Query the API once for every unique address.

        Parameters
        ----------
        timeout : float, optional
            Seconds to wait for each HTTP request (default 10).

        Returns
        -------
        (df_all_address, df_address) : tuple of DataFrame
            ``df_all_address`` has one row per candidate returned by the API;
            ``df_address`` has one row per address, using only the first
            candidate. Addresses without any candidate (or whose request
            failed) appear in both with ``title``, ``x`` and ``y`` empty.
        """
        if self.unique_addresses is None:
            # Build the address list on demand instead of failing with AttributeError.
            self.make_unique_addresses()
        else:
            print("ユニークなアドレスの数=>",len(self.unique_addresses))

        # Every candidate of every address
        all_results = []
        # First candidate of every address
        data_list = []

        # Run the search for each address and collect the records
        for _, row in tqdm_notebook(self.unique_addresses.iterrows(),total=len(self.unique_addresses)):
            address=row["住所"]
            candidates = self._fetch_candidates(address, timeout)
            if candidates:
                records = [self._make_record(row, address, candidate) for candidate in candidates]
            else:
                # Keep a placeholder row even when nothing was found
                records = [self._make_record(row, address, None)]
            all_results.extend(records)
            data_list.append(records[0])

        # Convert to DataFrames
        df_all_address = pd.DataFrame(all_results)
        print("全てのデータのデータフレーム")
        print(df_all_address)
        df_address=pd.DataFrame(data_list)
        print("データフレーム")
        print(df_address)

        return df_all_address,df_address

In [None]:
# Geocode df_all_tree: build the de-duplicated address list, then send one HTTP
# request per unique address to the address-search API.
geocode=Geocod_API(df_all_tree)
geocode.make_unique_addresses()
# df_all_address: one row per candidate returned by the API;
# df_address: one row per address, using only the first candidate.
df_all_address,df_address=geocode.search_addresses3()

In [None]:
# Display the per-address geocoding result (first candidate for each address).
df_address