### Project Road Map ###

0. Install Anaconda/Miniconda if you have not already, and set up a conda environment to use as your notebook kernel.

1. Obtain and configure a pre-trained CNN

2. Import and configure a Google Street View dataset

3. Create helper functions to allow for more efficient model training and experimentation, and install the Weights & Biases visualizer so it can be integrated into those functions.

4. Run, compare, and fix bugs until the model attains < 1 km accuracy

## 1. Obtain and configure a pre-trained CNN ##

Import a CNN model from PyTorch's torchvision

In [3]:
import torch
print(torch.__file__)
import torchvision
%matplotlib inline
import matplotlib.pyplot as plt
from torch import nn
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights
from torchinfo import summary

#pick the compute device once: use the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

#create model as a child of nn.Module
class GeoClassifier(nn.Module):
    """Hierarchical geolocation model on top of a frozen ResNet-50 backbone.

    An image batch is encoded by the backbone into 2048 features per image,
    projected to 512 features by a shared linear layer, and then passed through
    a cascade of heads: country -> region -> subregion -> coordinate offset.
    Every head after the first receives the shared 512 features concatenated
    with the softmax probabilities produced by the head before it.

    Args:
        num_countries: number of country classes.
        num_regions: number of region classes.
        num_subregions: number of subregion classes.
    """

    def __init__(self, num_countries: int, num_regions: int, num_subregions: int):
        super().__init__()

        #load the resnet backbone with explicit pretrained weights.
        #`weights` must be a member of the enum (or None), not the enum class itself;
        #passing the class only works through torchvision's deprecated legacy path,
        #which warns and resolves to IMAGENET1K_V1. Naming it here keeps the same weights
        #without the warning (ResNet50_Weights.DEFAULT would select the newest weights instead).
        self.backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        #remove the imagenet classifier; nn.Identity passes the pooled features through unchanged
        self.backbone.fc = nn.Identity()

        #freeze feature layers so only the layers defined below are trained
        #NOTE(review): requires_grad=False does not stop BatchNorm running statistics from
        #updating while the module is in train mode - confirm whether that is intended
        for param in self.backbone.parameters():
            param.requires_grad = False

        #shared fully connected layer to reduce dimensionality (2048 -> 512) before the heads
        #NOTE(review): there is no activation between this layer and the heads, so
        #shared_fc followed by country_head is a single linear map - consider a nonlinearity
        self.shared_fc = nn.Linear(2048, 512)

        #three hierarchical classification heads; each one after the first also takes the
        #class probabilities of the previous level as extra inputs
        self.country_head = nn.Linear(512, num_countries)
        self.region_head = nn.Linear(512 + num_countries, num_regions)
        self.subregion_head = nn.Linear(512 + num_regions, num_subregions)

        #regression head producing 2 values per image, intended as the lat/lon offset
        #from the selected subregion
        self.coord_regressor = nn.Linear(512 + num_subregions, 2)

    def forward(self, x):
        """Run the backbone and the cascaded heads on an image batch.

        Args:
            x: image batch accepted by the ResNet-50 backbone.

        Returns:
            dict with the raw (pre-softmax) outputs of every head:
            ``"country_logits"``, ``"region_logits"``, ``"city_logits"``
            (the subregion head's logits; the key name is kept for
            backward compatibility) and ``"delta_coords"``.
        """
        #extract image features from the resnet backbone and project them to 512 features
        x = self.backbone(x)
        x = self.shared_fc(x)

        #predict country
        country_logits = self.country_head(x)
        #probabilities are only used internally to condition the next head; logits are returned
        country_soft = torch.softmax(country_logits, dim=1)

        #predict region conditioned on country
        region_input = torch.cat([x, country_soft], dim=1)
        region_logits = self.region_head(region_input)
        region_soft = torch.softmax(region_logits, dim=1)

        #predict subregion conditioned on region
        subregion_input = torch.cat([x, region_soft], dim=1)
        subregion_logits = self.subregion_head(subregion_input)
        subregion_soft = torch.softmax(subregion_logits, dim=1)

        #predict coord deltas conditioned on subregion
        coord_input = torch.cat([x, subregion_soft], dim=1)
        delta_coords = self.coord_regressor(coord_input)

        return {
            "country_logits": country_logits,
            "region_logits": region_logits,
            "city_logits": subregion_logits,
            "delta_coords": delta_coords,
        }
        
        


#sanity check: build the model with one class per level and print a per-layer summary
#for a dummy batch of 32 RGB images at 224x224
model = GeoClassifier(1, 1, 1)
model = model.to(device)
summary(
    model=model,
    input_size=(32, 3, 224, 224),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)


c:\Users\Trevor Drummond\miniconda3\envs\geoguessr\Lib\site-packages\torch\__init__.py




Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
GeoClassifier (GeoClassifier)                 [32, 3, 224, 224]    [32, 2]              --                   Partial
├─ResNet (backbone)                           [32, 3, 224, 224]    [32, 2048]           --                   False
│    └─Conv2d (conv1)                         [32, 3, 224, 224]    [32, 64, 112, 112]   (9,408)              False
│    └─BatchNorm2d (bn1)                      [32, 64, 112, 112]   [32, 64, 112, 112]   (128)                False
│    └─ReLU (relu)                            [32, 64, 112, 112]   [32, 64, 112, 112]   --                   --
│    └─MaxPool2d (maxpool)                    [32, 64, 112, 112]   [32, 64, 56, 56]     --                   --
│    └─Sequential (layer1)                    [32, 64, 56, 56]     [32, 256, 56, 56]    --                   False
│    │    └─Bottleneck (0)                    [32, 64, 56, 56]     [32, 256, 56,