In [1]:
import os
from pathlib import Path

# Move the kernel to the project root so the relative paths used later in the
# notebook ("env/<file>.env", the `src` package) resolve.
# The unconditional `os.chdir(p.parent)` climbed one directory further on every
# re-run of this cell; only step up when the current directory does not already
# contain the `src` package, which makes the cell safe to execute repeatedly.
p = Path.cwd()
if not (p / "src").is_dir():
    os.chdir(p.parent)
os.getcwd()

'/root/repos/lightning/HousePricing'

In [2]:
from dotenv import dotenv_values
import os

# Env files (relative to the project root) whose entries are exported into the
# process environment so later cells can read them via os.environ.
envs = ["secret.env", "predict.env"]

for fenv in envs:
    file = os.path.join("env", fenv)
    config = dotenv_values(file)  # load sensitive variables
    # Only the key names are printed; the values may be secrets.
    print(config.keys())
    for c, v in config.items():
        # dotenv_values maps a key declared without "=value" to None, and
        # os.environ only accepts strings, so such entries are skipped
        # instead of raising TypeError.
        if v is not None:
            os.environ[c] = v

odict_keys(['WANDB_API_KEY'])
odict_keys(['WANDB_NAME', 'WANDB_NOTES', 'WANDB_NOTEBOOK_NAME', 'WANDB_INFERENCE_MODEL'])


In [3]:
import torch
from src.data import HousePricingDataModule

import os
import wandb
import torch
from lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from src.model import NeuralNetwork
from src.data import HousePricingDataModule


# Build the data module and run its preparation step up front.
# NOTE(review): presumably data_features() needs the prepared data — confirm
# against src.data, since the Trainer runs the preparation again at predict time.
data_module = HousePricingDataModule()
data_module.prepare_data()

# Number of input features in the data, used as the network's input size
in_features = data_module.data_features()

# Both values are read from the process environment (populated from the env
# files loaded earlier in this notebook).
best_model = os.environ["WANDB_INFERENCE_MODEL"]
project_name = os.environ["WANDB_NAME"]

accelerator = "gpu" if torch.cuda.is_available() else "cpu"

# Metadata recorded with the W&B run for this prediction session
config = {
    "accelerator": accelerator,
    "used_model": best_model,
    "in_features": in_features,
}


# This notebook only loads a trained checkpoint and predicts with it, so the
# run is registered as an inference job rather than a training one.
run = wandb.init(
    job_type="inference",
    name=best_model,
    project=project_name,
    config=config,
)


# Download the checkpoint aliased "best" for the selected model.
# NOTE(review): the "deepsat/House Pricing" entity/project prefix is hard-coded
# while the run's project comes from WANDB_NAME — confirm they are meant to differ.
artifact = run.use_artifact(f"deepsat/House Pricing/{best_model}:best", type="model")
artifact_dir = artifact.download()

# Trainer instance; in this notebook it is only used to drive prediction
trainer = Trainer(
    accelerator=wandb.config["accelerator"],
)

# Restore the trained network from the downloaded checkpoint
model = NeuralNetwork.load_from_checkpoint(
    checkpoint_path=os.path.join(artifact_dir, "model.ckpt"),
    input_size=wandb.config["in_features"],
)

[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (1314, 81)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (1314, 75)
[INFO]: Dropping categories types with few ocurrence. Current dataframe shape: (1314, 75)
[INFO]: Few ocurrences removed. Current dataframe shape: (1295, 75)
[INFO]: Dropping columns which contains just one type of category. Current dataframe shape: (1295, 75)
[INFO]: Columns with just one type of categroy dropped. Current dataframe shape: (1295, 74)
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (146, 81)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (146, 75)
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (1459, 80)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (1459, 74)
[INFO]: Set up datasets: dict_ke

[34m[1mwandb[0m: Currently logged in as: [33mwilber-quito[0m ([33mdeepsat[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


[INFO]: Input size: 244


In [4]:
model

NeuralNetwork(
  (net): Sequential(
    (0): Linear(in_features=244, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [5]:
# Run the predict loop of the restored model over the data module's predict
# dataloader. The result is a list with one entry per batch; in the recorded
# run each entry is a dict holding an "id" tensor and a "prediction" tensor.
prediction = trainer.predict(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (1314, 81)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (1314, 75)
[INFO]: Dropping categories types with few ocurrence. Current dataframe shape: (1314, 75)
[INFO]: Few ocurrences removed. Current dataframe shape: (1295, 75)
[INFO]: Dropping columns which contains just one type of category. Current dataframe shape: (1295, 75)
[INFO]: Columns with just one type of categroy dropped. Current dataframe shape: (1295, 74)
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (146, 81)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (146, 75)
[INFO]: Dropping columns with full of NA or Identifiers. Current dataframe shape: (1459, 80)
[INFO]: Dropped columns with full of NA or Identifiers. Current dataframe shape: (1459, 74)
[INFO]: Set up datasets: dict_ke

In [6]:
len(prediction)

12

In [7]:
prediction

[{'id': tensor([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472,
          1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484,
          1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496,
          1497, 1498, 1499, 1500, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1508,
          1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1520,
          1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532,
          1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544,
          1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556,
          1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568,
          1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580,
          1581, 1582, 1583, 1584, 1585, 1586, 1587, 1588], dtype=torch.int32),
  'prediction': tensor([134337.3281, 166749.8750, 188016.5000, 200668.7188, 180327.2344,
      

In [8]:
def prediction_to_submit(prediction_batches) -> dict:
    """Flatten per-batch prediction output into submission columns.

    Args:
        prediction_batches: Iterable of mappings, one per batch, each with an
            "id" entry and a "prediction" entry whose values expose
            ``.tolist()`` (e.g. tensors).

    Returns:
        A dict with two parallel lists, ``"Id"`` and ``"SalePrice"``, holding
        the ids and predictions of all batches in iteration order. Both lists
        are empty when no batches are given.
    """
    submit = {"Id": [], "SalePrice": []}

    for prediction_batch in prediction_batches:
        # Extend in place: rebuilding the lists with `+` would copy everything
        # accumulated so far on every batch.
        submit["Id"].extend(prediction_batch["id"].tolist())
        submit["SalePrice"].extend(prediction_batch["prediction"].tolist())

    return submit

In [9]:
submit = prediction_to_submit(prediction)

In [10]:
submit["Id"][:4]

[1461, 1462, 1463, 1464]

In [11]:
submit["SalePrice"][:4]

[134337.328125, 166749.875, 188016.5, 200668.71875]

In [12]:
import pandas as pd

# Turn the flattened {"Id": [...], "SalePrice": [...]} dict into a two-column
# DataFrame and preview its first rows.
submission = pd.DataFrame(data=submit)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,134337.328125
1,1462,166749.875
2,1463,188016.5
3,1464,200668.71875
4,1465,180327.234375


In [13]:
!pwd

/root/repos/lightning/HousePricing


In [14]:
import pandas as pd

# Write the submission into the current working directory; index=False keeps
# the DataFrame's row index out of the file so it holds only the data columns.
submission.to_csv("./submission.csv", index=False)

In [15]:
wandb.finish()