# Machine Learning for Sustainable Systems Final Project


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler, ConcatDataset
import os
from PIL import Image
import pandas as pd
import random
from torch.utils.data import random_split

In [4]:
torch.manual_seed(0)
oilTrainLabels = pd.read_csv('/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/traininglabels.csv')

In [27]:
# Note: this portion utilized ChatGPT
class OilPalmDataset(Dataset):
    def __init__(self, csv, transform):
        self.data = pd.read_csv(csv)
        self.transform = transform

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        image_path = os.path.join('/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/train_images/'+self.data.loc[idx,'image_id'])
        image = Image.open(image_path)
        image = self.transform(image)
        label = "oil_palm" if self.data.loc[idx,'has_oilpalm'] == 1 else "not_oil_palm"
        return {'image': image, 'labels': label}

In [30]:
class LCDataset(Dataset):
    def __init__(self, csv, transform):
        self.data = pd.read_csv(csv)
        self.transform = transform

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        image_path = os.path.join('/Volumes/LaCie/archive/planet/planet/train-jpg'+self.data.loc[idx,'image_name'])
        image = Image.open(image_path)
        image = self.transform(image)
        label = self.data.loc[idx, 'tags']
        return {'image': image, 'labels': label}

In [7]:
# This is a general class to import any of the images, note ChatGPT was consulted in the creation of this code
class LandCoverDataset(Dataset):
    def __init__(self, csv, folder, transform, label_column, image_id_column):
        self.data = pd.read_csv(csv)
        self.transform = transform
        self.folder = folder
        self.label_column = label_column
        self.image_id_column = image_id_column
        
        if self.label_column == 'has_oilpalm':
            self.data[self.label_column] = self.data[self.label_column].map({1: 'oil_palm', 0: 'not_oil_palm'})
        elif self.label_column == 'tags':
            self.land_cover_types = self.data[self.label_column].unique()
            self.categories = self.categorize_land_cover(self.land_cover_types)
            self.data[self.label_column] = self.data[self.label_column].apply(self.adjust_label)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
            image_path = os.path.join(self.folder, self.data.loc[idx, self.image_id_column])
            print("ogimg",image_path)
            print(self.image_id_column)
            if self.image_id_column == 'image_id':
                image_path = image_path[:-8]
            image_path += ".jpg"
            print("newimg",image_path)
            if os.path.exists(image_path):
                image = Image.open(image_path)
                image = self.transform(image)
                label = self.data.loc[idx, self.label_column]
                if self.label_column == 'tags':
                    label = self.data.loc[idx, self.label_column]
                    adjusted_label = self.adjust_label(label)
                    return {'image': image, 'labels': adjusted_label}
                else:
                    return {'image': image, 'labels': label}

    def subset(self, labels):
        # Note that this irreversibly changes the input-- that is ok for my application
        self.data = self.data[self.data[self.label_column].isin(labels)]
        return self

    def categorize_land_cover(self, land_cover_types):
        categories = []
        for land_cover in land_cover_types:
            if 'selective_logging' in land_cover:
                categories.append('selective_logging')
            elif 'agriculture' in land_cover:
                categories.append('agriculture')
            elif 'cultivation' in land_cover:
                categories.append('agriculture')
            elif 'habitation' in land_cover:
                categories.append('habitation')
            elif 'blow_down' in land_cover:
                categories.append('blow_down')
            elif 'road' in land_cover:
                categories.append('road')
            elif 'primary' in land_cover:
                categories.append('primary')
            else:
                categories.append('other')
        return categories
    
    def adjust_label(self, label):
        if 'selective_logging' in label:
            return 'selective_logging'
        elif 'agriculture' in label or 'cultivation' in label:
            return 'agriculture'
        elif 'habitation' in label:
            return 'habitation'
        elif 'blow_down' in label:
            return 'blow_down'
        elif 'road' in label:
            return 'road'
        elif 'primary' in label:
            return 'primary'
        else:
            return 'other'

    def print_labels(self):
        print(self.data[self.label_column].unique())



In [8]:
mlaa_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.Lambda(lambda img: img.convert('RGB')),  
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [9]:
trainingOil_dataset = LandCoverDataset('/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/traininglabels.csv','/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/train_images', mlaa_transform, 'has_oilpalm', 'image_id')

In [558]:
sample = trainingOil_dataset[11000]
#len(trainingOil_dataset)
sample

ogimg /Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/train_images/img_050892018.jpg
image_id
newimg /Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/train_images/img_05089.jpg


{'image': tensor([[[-1.4158, -1.4672, -1.3815,  ..., -1.2959, -1.0904, -0.9192],
          [-1.3644, -1.3987, -1.2788,  ..., -1.3987, -1.4158, -1.3987],
          [-1.2103, -1.1247, -0.8335,  ..., -1.3473, -1.5014, -1.5528],
          ...,
          [ 1.2557,  1.3413,  1.2214,  ...,  0.1768, -0.2342, -0.5767],
          [ 0.7077,  0.9303,  1.2385,  ...,  0.1083, -0.1828, -0.5253],
          [ 0.1426,  0.5364,  1.1872,  ...,  0.1597,  0.0569, -0.4226]],
 
         [[-1.2829, -1.3704, -1.2654,  ..., -1.1429, -0.9153, -0.7227],
          [-1.2479, -1.3354, -1.2479,  ..., -1.2479, -1.2129, -1.1604],
          [-1.1604, -1.1078, -0.8978,  ..., -1.2129, -1.3004, -1.2654],
          ...,
          [ 0.6254,  0.7479,  0.6954,  ..., -0.0049, -0.2850, -0.5301],
          [ 0.2052,  0.4153,  0.6954,  ..., -0.0924, -0.2675, -0.4951],
          [-0.2850,  0.0476,  0.6429,  ..., -0.0574, -0.0399, -0.4251]],
 
         [[-1.3339, -1.4036, -1.3339,  ..., -1.3861, -1.1944, -1.0027],
          [-1.4384,

In [559]:
trainingOil_dataset.print_labels()

['not_oil_palm' 'oil_palm']


In [560]:
trainingLC_dataset = LandCoverDataset('/Volumes/LaCie/archive/planet/planet/train_classes.csv', '/Volumes/LaCie/archive/planet/planet/train-jpg', mlaa_transform, 'tags', 'image_name')

In [561]:
len(trainingLC_dataset)

40479

In [562]:
trainingLC_dataset.print_labels()

['primary' 'agriculture' 'other' 'road' 'habitation' 'selective_logging'
 'blow_down']


In [563]:
trainingLC_dataset[1]

ogimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_1
image_name
newimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_1.jpg


{'image': tensor([[[-1.2959, -1.2274, -1.1247,  ..., -1.3302, -1.2788, -1.1589],
          [-1.2445, -1.1932, -1.1247,  ..., -1.3644, -1.4158, -1.3473],
          [-1.1932, -1.1760, -1.1418,  ..., -1.3302, -1.4158, -1.4329],
          ...,
          [-0.5253, -0.4739, -0.4226,  ..., -0.9020, -0.7993, -0.6623],
          [-0.5424, -0.4911, -0.4226,  ..., -0.7822, -0.6623, -0.5424],
          [-0.5767, -0.4911, -0.4226,  ..., -0.6965, -0.5767, -0.4568]],
 
         [[-0.9678, -0.9153, -0.8277,  ..., -1.0728, -1.0553, -0.9503],
          [-0.9503, -0.8978, -0.8277,  ..., -1.1253, -1.1253, -1.0203],
          [-0.9153, -0.8803, -0.8452,  ..., -1.1078, -1.1429, -1.0903],
          ...,
          [-0.3025, -0.2500, -0.1975,  ..., -0.5826, -0.4776, -0.4251],
          [-0.3375, -0.2850, -0.2325,  ..., -0.4951, -0.3901, -0.3200],
          [-0.3550, -0.3025, -0.2325,  ..., -0.4601, -0.3550, -0.2850]],
 
         [[-0.8458, -0.8110, -0.7587,  ..., -0.9504, -0.9156, -0.8110],
          [-0.7936,

In [564]:
oilPalmOnly = trainingOil_dataset.subset(['oil_palm'])

In [565]:
print(len(oilPalmOnly))
oilPalmOnly.print_labels()

942
['oil_palm']


In [566]:
relevantLandUse = trainingLC_dataset.subset(['primary', 'road','selective_logging'])

# Create indices for the dataset
indices = list(range(len(relevantLandUse)))

# Shuffle the indices randomly
random.shuffle(indices)

total_size = len(relevantLandUse)
train_size = int(0.8 * total_size)  # 80% for training
test_size = total_size - train_size 
# Create DataLoader for training and testing
LC_train, LC_test = random_split(relevantLandUse, [train_size, test_size])

In [567]:
LC_train.dataset


<__main__.LandCoverDataset at 0x2858ab0e0>

In [590]:
class ConcatenatedLandCoverDataset(Dataset):
    def __init__(self, dataset1, dataset2):
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        self.length = len(dataset1) + len(dataset2)

    def __getitem__(self, idx):
        if idx < len(self.dataset1):
            return self.dataset1[idx]
        else:
            return self.dataset2[idx - len(self.dataset1)]

    def __len__(self):
        return len(self.dataset1) + len(self.dataset2)

    def dropna(self, labels):
        # Note that this irreversibly changes the input-- that is ok for my application
        self.data = self.data[self.data[self.label_column].isin(labels)]
        return self

# Then create the concatenated dataset like this:
concatenated_train_dataset = ConcatenatedLandCoverDataset(LC_train.dataset, oilPalmOnly)

In [591]:
concatenated_train_dataset[1000]

ogimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_1000
image_name
newimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_1000.jpg


{'image': tensor([[[-0.8335, -0.8335, -0.7993,  ..., -0.9020, -0.9363, -0.9020],
          [-0.7822, -0.8164, -0.8164,  ..., -0.9020, -0.9534, -0.9192],
          [-0.7137, -0.7650, -0.7822,  ..., -0.9020, -0.9534, -0.9363],
          ...,
          [ 0.4851,  0.4508,  0.3994,  ..., -0.9363, -0.9020, -0.8507],
          [ 0.4679,  0.4508,  0.3994,  ..., -0.9534, -0.9192, -0.8849],
          [ 0.4337,  0.4508,  0.3994,  ..., -0.9877, -0.9705, -0.9192]],
 
         [[-0.4426, -0.4776, -0.4776,  ..., -0.5826, -0.5826, -0.5126],
          [-0.4076, -0.4601, -0.4426,  ..., -0.6001, -0.6176, -0.5476],
          [-0.3725, -0.4251, -0.4076,  ..., -0.6176, -0.6352, -0.5651],
          ...,
          [ 0.6779,  0.7129,  0.6954,  ..., -0.6001, -0.5651, -0.5301],
          [ 0.6954,  0.7129,  0.6779,  ..., -0.6001, -0.5826, -0.5476],
          [ 0.7129,  0.7304,  0.6604,  ..., -0.6001, -0.5826, -0.5476]],
 
         [[-0.3404, -0.2532, -0.2010,  ..., -0.3230, -0.3055, -0.2532],
          [-0.2881,

In [592]:
relevantLandUse[2000]
relevantLandUse.print_labels()
len(relevantLandUse)

ogimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_2000
image_name
newimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_2000.jpg
['primary' 'road' 'selective_logging']


23883

In [593]:
oilTest = LandCoverDataset('/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/testlabels.csv','/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/leaderboard_test_data', mlaa_transform, 'has_oilpalm', 'image_id')

In [594]:
oilTestP = oilTest.subset(['oil_palm'])
oilTestP[1]
concatenated_test_dataset = ConcatenatedLandCoverDataset(LC_test.dataset, oilTestP)

ogimg /Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/leaderboard_test_data/img_048172017.jpg
image_id
newimg /Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/leaderboard_test_data/img_04817.jpg


In [595]:
concatenated_test_dataset[10000]

ogimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_10000
image_name
newimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_10000.jpg


{'image': tensor([[[-1.3644, -1.3644, -1.3473,  ..., -1.3473, -1.2788, -1.2274],
          [-1.3130, -1.3473, -1.3644,  ..., -1.2959, -1.2274, -1.1932],
          [-1.2959, -1.3644, -1.4158,  ..., -1.2788, -1.2103, -1.2274],
          ...,
          [-1.1932, -1.2445, -1.3302,  ..., -1.3302, -1.3130, -1.2959],
          [-1.2103, -1.2445, -1.3130,  ..., -1.2959, -1.3130, -1.3130],
          [-1.2445, -1.2274, -1.2788,  ..., -1.2788, -1.3473, -1.3815]],
 
         [[-0.8978, -0.8627, -0.8978,  ..., -0.9503, -0.8978, -0.8277],
          [-0.8627, -0.8803, -0.9153,  ..., -0.9853, -0.8978, -0.8102],
          [-0.8627, -0.9328, -0.9678,  ..., -1.0028, -0.8978, -0.8102],
          ...,
          [-0.7752, -0.8277, -0.8627,  ..., -0.9153, -0.9153, -0.9153],
          [-0.7927, -0.8277, -0.8803,  ..., -0.8452, -0.8803, -0.9328],
          [-0.7927, -0.8277, -0.8803,  ..., -0.7927, -0.8627, -0.9153]],
 
         [[-0.9853, -0.9678, -1.0027,  ..., -0.9853, -0.9678, -0.9330],
          [-0.9853,

In [596]:
batch_size = 4
concat_train_loader = DataLoader(concatenated_train_dataset, batch_size = batch_size)
concat_test_loader = DataLoader(concatenated_test_dataset, batch_size = batch_size)

In [612]:
dropna(concatenated_train_dataset)

NameError: name 'dropna' is not defined

ogimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_0
image_name
newimg /Volumes/LaCie/archive/planet/planet/train-jpg/train_0.jpg


KeyError: 1

In [611]:
concatenated_train_dataset[12]

KeyError: 12

In [11]:
import shutil
def move_files(source_folder, destination_folder):
    # Get a list of all files in the source folder
    files = os.listdir(source_folder)
    
    # Iterate through each file and move it to the destination folder
    for file in files:
        # Construct the full file paths
        source_file_path = os.path.join(source_folder, file)
        destination_file_path = os.path.join(destination_folder, file)
        
        # Move the file to the destination folder
        shutil.move(source_file_path, destination_file_path)
move_files('/Volumes/LaCie/MIT/MLSS Final Project/Oil_Palm_Kaggle/leaderboard_test_data', '/Volumes/LaCie/MIT/MLSS Final Project/datasets_apr22/test-data')