In [1]:
import pandas as pd

#xyz파일을 구문 분석할 함수를 정의
def parse_xyz(filepath):
    molecules=[]
    current_molecule=[]
    
    #파일을 열고 한 줄씩 읽기
    with open(filepath,'r') as file:
        while True:
            #분자의 원자수를 읽어라
            line=file.readline()
            if not line:
                break #EDF
            num_atoms=int(line.strip())
            
            #추가 데이터 포함
            comment=file.readline().strip()
            
             # 이 분자의 데이터를 저장하기 위해 목록을 초기화합니다
            atoms = []
            coordinates = []
            
            #각 원자의 데이터를 읽는다
            for _ in range(num_atoms):
                atom_data=file.readline().strip().split()
                atoms.append(atom_data[0])
                coordinates.append([float(coord) for coord in atom_data[1:4]])
                
                #분자 데이터 저장
                current_molecule={
                    'num_atoms':num_atoms,
                    'comment':comment,
                    'atoms':atoms,
                    'coordinates':coordinates
                }
                molecules.append(current_molecule)
            
        return molecules
                
                
                

In [2]:
# 훈련데이터 
train_data = parse_xyz('C:/Users/82106/Desktop/데이터분석 프로젝트/삼성 경진대회/open/train.xyz')

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 데이터 세트에서 특징 및 레이블 추출
def extract_features_labels(molecules):
    features = []
    labels = []
    
    for molecule in molecules:
        # 좌표와 힘을 하나의 리스트로 평탄화합니다
        flat_coordinates = np.array(molecule['coordinates']).flatten()
        energy = float(molecule['comment'].split()[5])
        
        features.append(flat_coordinates)
        labels.append(energy)
        
    return features, labels
# train data에 energy 에서 comment 평탄화한 특성 추가하기
features, labels = extract_features_labels(train_data)
    

In [4]:
#max_features=최대 len (features 만큼 반복)
max_features=max(len(feature) for feature in features)

# pad features numpy 기능 정의
def pad_features_numpy(features, max_features_length):
    #padded_features = 0, max 특성에서 feature 길이를 뺀것 -> pad 처리
    padded_features= [np.pad(feature,
                            (0,max_features_length-len(feature)),
                            'constant',constant_values=0)
                      for feature in features]
    return np.array(padded_features)

#새로운 pad features numpy 추가
padded_features=pad_features_numpy(features,max_features)



In [5]:
#데이터 분할
X_train,X_test,y_train,y_test=train_test_split(features,
                                              labels,
                                              test_size=0.2)
len(X_train),len(X_test),len(y_train),len(y_test)

(1027980, 256995, 1027980, 256995)

In [6]:
#랜덤 포레스트 생성
rf_model=RandomForestRegressor(n_estimators=10,max_depth=5,
                               random_state=42)
#각 리스트를 numpy로 변환
X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array(y_train)
y_test=np.array(y_test)

  X_train=np.array(X_train)
  X_test=np.array(X_test)


In [7]:
#데이터 크기를 10%로 낮추기 
sample_size= int(0.1 * len(X_train))

#indices 변수에 sample_size 개수의 랜덤한 데이터 포인트의 인덱스가 저장
np.random.seed(42)
indices=np.random.choice(range(len(X_train)),
                        sample_size,
                        replace=False)

X_train_sample=X_train[indices]
y_train_sample=y_train[indices]

In [8]:
len(X_train_sample),len(y_train_sample)

(102798, 102798)

In [9]:
most_common_length = 48

#X_train_sample의 각 데이터 포인트를 패딩 또는 자르기를 사용하여 동일한 길이로 조정하는 작업
X_train_sample_adjusted=np.array([np.pad(x,(0,most_common_length-len(x)),
                                        'constant') if len(x)<most_common_length else x[:most_common_length] for x in X_train_sample])
# X_test의 각 데이터 포인트를 조정
X_test_adjusted = np.array([np.pad(x, (0, max(0, most_common_length - len(x))), 'constant', constant_values=0)
                            if len(x) < most_common_length 
                            else x[:most_common_length] 
                            for x in X_test])

In [10]:
# 길이를 맞춘 X_train_sample_adjusted 로 학습하기
rf_model.fit(X_train_sample_adjusted,y_train_sample)

RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42)

In [11]:
#예측 진행
predictions=rf_model.predict(X_test_adjusted)

In [12]:
rmse=np.sqrt(mean_squared_error(y_test,predictions))
rmse

0.044951758470363305

### CatBoost

In [13]:
!pip install catboost



In [14]:
from catboost import CatBoostRegressor

X_train,X_test,y_train,y_test=train_test_split(features,
                                              labels,
                                              test_size=0.2,
                                              random_state=123)

len(X_train), len(X_test), len(y_train), len(y_test)


(1027980, 256995, 1027980, 256995)

In [15]:
catboost=CatBoostRegressor(verbose=False,random_seed=42)

X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array(y_train)
y_test=np.array(y_test)

  X_train=np.array(X_train)
  X_test=np.array(X_test)


In [16]:
sample_size=int(0.1*len(X_train))

#sample_size만큼 선택하기
np.random.seed(42)
indices=np.random.choice(range(len(X_train)),sample_size,
                        replace=False)
X_train_sample=X_train[indices]
y_train_sample=y_train[indices]

len(X_train_sample),len(y_train_sample)

(102798, 102798)

In [19]:
#패딩과 잘라내기를 통해 데이터의 일관된 형식을 유지하거나, 특정 길이로 데이터를 정규화
X_train_sample_adjusted=np.array([np.pad(x,
                                        (0,
                                        most_common_length-len(x)),
                                         'constant') if len(x)<most_common_length else x[:most_common_length] for x in X_train_sample])

catboost.fit(X_train_sample_adjusted, y_train_sample)

<catboost.core.CatBoostRegressor at 0x221833c69a0>

In [20]:
predictions=catboost.predict(X_test_adjusted)



In [22]:
rmse=np.sqrt(mean_squared_error(y_test,predictions))
rmse

0.28137971750444507

In [24]:
predictions

array([-2.04416061e-04, -9.21128779e-05, -2.41823537e-03, ...,
       -5.22853412e-04, -3.47931303e-03,  2.38218770e-04])