**Import Essential Packages**

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


**Training, Testing, Validation sets**

In [29]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

label_map = {
    "tetragonal": 0,
    "orthorhombic": 1,
    "trigonal": 2,
    "cubic": 3,
    "triclinic": 4,
    "monoclinic": 5,
    "hexagonal": 6
}

# 自定義 Dataset
class XRD_Dataset(Dataset):
    def __init__(self, file_list, data_dir):
        self.file_list = file_list
        self.data_dir = data_dir

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_name, label = self.file_list[idx]
        
        # 將 .cif 檔名更改為 .npz
        file_name = file_name.replace('.cif', '_convolved.npz')
        
        file_path = os.path.join(self.data_dir, file_name)
        data = np.load(file_path)
        x = np.stack((data['x_fine'], data['y_convolved']), axis=1).astype(np.float32)
        # Map string label to integer index
        label = label_map[label]
        return x, label

# 讀取 csv 文件
csv_path = "structure_info.csv"  # 替換為實際 csv 文件路徑
data_dir = "D:/OLD_DATA/main/NTU/third_grade/ML_PHYS/project/output_data/output_data"  # 包含 .npz 文件的目錄
df = pd.read_csv(csv_path)

# 分割數據集
split_info = {
    "tetragonal": (300, 300, 30),
    "orthorhombic": (500, 500, 50),
    "trigonal": (100, 100, 30),
    "cubic": (500, 500, 50),
    "triclinic": (100, 100, 30),
    "monoclinic": (500, 500, 50),
    "hexagonal": (100, 100, 5)
}

train_list, test_list, val_list = [], [], []

for label, (train_size, test_size, val_size) in split_info.items():
    files = df[df['cell_structure'] == label][['filename', 'cell_structure']].values.tolist()
    
    # 分割成訓練、測試、驗證集
    train_files, temp_files = train_test_split(files, train_size=train_size, random_state=42)
    test_files, temp_files = train_test_split(temp_files, train_size=test_size, random_state=42)
    val_files, temp_files = train_test_split(temp_files, train_size=val_size, random_state=42)
    
    train_list.extend(train_files)
    test_list.extend(test_files)
    val_list.extend(val_files)

# 創建 DataLoader
batch_size = 32

train_dataset = XRD_Dataset(train_list, data_dir)
test_dataset = XRD_Dataset(test_list, data_dir)
val_dataset = XRD_Dataset(val_list, data_dir)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("數據加載完成：")
print(f"訓練集樣本數: {len(train_dataset)}")
print(f"測試集樣本數: {len(test_dataset)}")
print(f"驗證集樣本數: {len(val_dataset)}")


數據加載完成：
訓練集樣本數: 2100
測試集樣本數: 2100
驗證集樣本數: 245


**Define CNN Model**

In [30]:
import torch
import torch.nn as nn

class XRD_CNN(nn.Module):
    def __init__(self, num_classes):
        super(XRD_CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=2, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.3)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(128 * 1062, 256)  # 1062 是 8500 經過 3 次 MaxPooling1d(2) 後的大小
        self.fc2 = nn.Linear(256, num_classes)  # `num_classes` 是 cell_structure 的種類數
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.relu(self.conv2(x))
        x = self.pool(x)
        x = torch.relu(self.conv3(x))
        x = self.pool(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)  # 最後一層輸出 logits
        return x


**Training**

In [31]:
# 定義類別數量（如晶系數量）
num_classes = 7

# 初始化模型
model = XRD_CNN(num_classes=num_classes)
print(model)

# 定義損失函數與優化器
criterion = nn.CrossEntropyLoss()  # 用於分類問題
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 訓練範例
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for inputs, labels in train_loader:
            inputs = inputs.permute(0, 2, 1)  # (batch_size, 8500, 2) -> (batch_size, 2, 8500)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            print(f"Loss = {loss.item()}")
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}")


XRD_CNN(
  (conv1): Conv1d(2, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=135936, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=7, bias=True)
)


In [32]:
epochs = 20
# 開始訓練
train_model(model, train_loader, criterion, optimizer, num_epochs=epochs)

Loss = 1.8737156391143799
Loss = 86.90399169921875
Loss = 85.5404052734375
Loss = 80.49240112304688
Loss = 66.21749877929688
Loss = 41.86954116821289
Loss = 23.678503036499023
Loss = 9.794734001159668
Loss = 4.6292643547058105
Loss = 2.2348783016204834
Loss = 1.5911495685577393
Loss = 2.0745062828063965
Loss = 1.8824174404144287
Loss = 3.565629243850708
Loss = 2.4234955310821533
Loss = 2.167215585708618
Loss = 2.1616640090942383
Loss = 1.978990077972412
Loss = 1.9053982496261597
Loss = 2.0071120262145996
Loss = 1.984639286994934
Loss = 1.8313980102539062
Loss = 1.9406851530075073
Loss = 1.696266770362854
Loss = 1.7061768770217896
Loss = 1.7264364957809448
Loss = 1.9489901065826416
Loss = 1.7845884561538696
Loss = 1.773022174835205
Loss = 1.7658648490905762
Loss = 1.8151524066925049
Loss = 1.6436835527420044
Loss = 1.8170974254608154
Loss = 1.7820680141448975
Loss = 1.6926569938659668
Loss = 1.5802555084228516
Loss = 1.6820626258850098
Loss = 1.5848362445831299
Loss = 1.9022822380065918

**Evaluation**

In [33]:
def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.permute(0, 2, 1)  # (batch_size, 8500, 2) -> (batch_size, 2, 8500)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Accuracy: {100 * correct / total:.2f}%")

In [34]:
# 在驗證和測試集上評估
evaluate_model(model, val_loader)
evaluate_model(model, test_loader)

Accuracy: 20.41%
Accuracy: 23.81%


In [36]:
torch.save(model, "\models\CNN.pt")

  torch.save(model, "\models\CNN.pt")
  torch.save(model, "\models\CNN.pt")


RuntimeError: Parent directory \models does not exist.