In [4]:
import torch
import time
import sys

# Checking CUDA Availability

In [4]:
# Report which PyTorch build this kernel has loaded.
torch_version = torch.__version__
print(f"Pytorch version : {torch_version}")

Pytorch version : 2.7.0


In [7]:
# Ask PyTorch whether it can use a CUDA device in this environment.
cuda_available = torch.cuda.is_available()
print(f"CUDA available? : {cuda_available}")

CUDA available? : True


In [19]:
# Show the CUDA version string reported by torch.version.
cuda_version = torch.version.cuda
print(f"CUDA version : {cuda_version}")

CUDA version : 12.8


In [13]:
# Report how many CUDA devices are visible and the memory size of device 0.
# Querying device properties raises when no GPU is present, so only do it
# when at least one device was counted.
gpu_count = torch.cuda.device_count()
print(f"Number of GPUs : {gpu_count}")
if gpu_count > 0:
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU memory : {total_bytes / 1e9:.2f} GB")
else:
    print("GPU memory : n/a (no CUDA device detected)")

Number of GPUs : 1
GPU memory : 42.41 GB


# Tensor Creation Basics

In [15]:
# 3x3 tensor of standard-normal samples; no device given, so it uses the default device.
cpu_tensor = torch.randn((3, 3))

In [18]:
# Show the tensor's values together with where it lives and its element type.
print(
    f"CPU tensor : \n{cpu_tensor}",
    f"Device : {cpu_tensor.device}",
    f"Data Type : {cpu_tensor.dtype}",
    sep="\n",
)

CPU tensor : 
tensor([[ 1.0463,  0.2168,  1.0765],
        [ 0.8140,  1.1421, -0.7343],
        [ 0.0550, -0.8853,  1.2807]])
Device : cpu
Data Type : torch.float32


### Creating tensor directly on GPU

In [20]:
# Allocate the tensor straight on the GPU (skipped when CUDA is unavailable),
# then show its values, device and dtype.
if torch.cuda.is_available():
    gpu_tensor = torch.randn((3, 3), device='cuda')
    print(
        f"GPU Tensor : \n{gpu_tensor}",
        f"Device : {gpu_tensor.device}",
        f"Data Type : {gpu_tensor.dtype}",
        sep="\n",
    )

GPU Tensor : 
tensor([[ 0.4631, -1.1864, -0.0991],
        [-0.4888,  1.6592, -2.2065],
        [-0.8809, -0.0747, -0.9419]], device='cuda:0')
Device : cuda:0
Data Type : torch.float32


In [21]:
# Alternative: create the tensor on the CPU first, then move it to the GPU with .cuda()

In [22]:
# Build the tensor on the CPU, then copy it to the GPU with .cuda().
# Guarded on CUDA availability (like the direct-creation cell) so the
# notebook still runs on a machine without a GPU.
if torch.cuda.is_available():
    cpu_to_gpu = torch.randn(3,3).cuda()
    print(f"Device : {cpu_to_gpu.device}")
else:
    print("CUDA not available - skipping the CPU -> GPU copy")

Device : cuda:0


# Measure Device Transfer Overhead

In [23]:
# Side lengths of the square float32 tensors used by the transfer benchmark.
# 100000 was dropped: 100000 * 100000 * 4 bytes = 40 GB per copy, which does
# not fit alongside anything else on the ~42 GB GPU reported above (the
# recorded benchmark output never got past 10000).
sizes = [100, 1000, 10000]

In [None]:
# Benchmark host <-> device copy time for square tensors of each side length
# in `sizes` (defined in an earlier cell). Requires a CUDA device.
if torch.cuda.is_available():
    print(f"{'Size':<15} {'CPU->GPU (ms)':<20} {'GPU->CPU (ms)':<20}")
    for size in sizes:
        # Dedicated names: do not overwrite the notebook-level `cpu_tensor` /
        # `gpu_tensor` that earlier cells created and displayed.
        host_tensor = torch.randn(size, size)

        # Drain any GPU work already queued so it is not billed to the copy.
        torch.cuda.synchronize()

        # Measure CPU -> GPU transfer
        # perf_counter is monotonic and high-resolution; time.time() is wall
        # clock and can jump, which matters for sub-millisecond intervals.
        start = time.perf_counter()
        device_tensor = host_tensor.cuda()
        torch.cuda.synchronize()  # wait for the copy to finish before stopping the clock
        cpu_to_gpu_time = (time.perf_counter() - start) * 1000

        # Measure GPU -> CPU transfer
        start = time.perf_counter()
        back_to_cpu = device_tensor.cpu()
        torch.cuda.synchronize()  # defensive: make sure the copy back is complete
        gpu_to_cpu_time = (time.perf_counter() - start) * 1000

        print(f"{size:<15} {cpu_to_gpu_time:<20.4f} {gpu_to_cpu_time:<20.4f}")

        # Drop the large buffers before the next (bigger) iteration allocates.
        del host_tensor, device_tensor
else:
    print("CUDA not available - skipping the transfer benchmark")

Size            CPU->GPU (ms)        GPU->CPU (ms)       
100             0.1645               0.0489              
1000            0.3083               2.6274              
10000           22.9452              248.5583            


# Tensor Operations on GPU
Questions to explore:
1. Do operations require tensors to be on the same device?
2. What happens if you mix CPU and GPU tensors?
3. How does broadcasting work on GPU?

In [7]:
# Create two 3x3 operand tensors on the compute device.
# Prefer the GPU, but fall back to the CPU so the operation cells that use
# `a` and `b` still run on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
a = torch.randn(3, 3, device=device)
b = torch.randn(3, 3, device=device)

In [8]:
# Element-wise sum of the two operands; report which device holds the result.
print("Addition: ")
c = torch.add(a, b)
print(f"Result device : {c.device}")

Addition: 
Result device : cuda:0


In [9]:
# 2-D matrix product of the two operands; report which device holds the result.
print("Matrix Multiplication: ")
c = a.mm(b)
print(f"Result device : {c.device}")

Matrix Multiplication: 
Result device : cuda:0


In [10]:
# Broadcasting: the Python scalar is added to every element of `a`.
print("Broadcasting (tensor + scalar)")
e = torch.add(a, 5.0)
print(f"Result shape: {e.shape}, device: {e.device}")

Broadcasting (tensor + scalar)
Result shape: torch.Size([3, 3]), device: cuda:0


In [None]:
# Demonstrate device mismatch error: adding a CPU tensor to a GPU tensor
# raises, and the error is caught and displayed instead of stopping the notebook.
if torch.cuda.is_available():
    print("\n Attempting CPU + GPU operation: ")
    # Build the operands outside the try so that only the mixed-device
    # operation is guarded; an allocation failure is not misreported.
    cpu_tensor = torch.randn(3,3)
    gpu_tensor = torch.randn(3,3, device='cuda')
    try:
        result = cpu_tensor + gpu_tensor
    except RuntimeError as err:
        # Bound as `err`, not `e`: Python unbinds the except target when the
        # handler exits, which would delete the notebook-level `e` tensor
        # created by the broadcasting cell.
        print(f"Error: {err}")
        print("All tensors in an operation must be on the same device")
        
    