In [1]:
# Install necessary libraries
%pip install pandas numpy tensorflow scikit-learn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-win_amd64.whl (11.6 MB)
     ---------------------------------------- 11.6/11.6 MB 6.6 MB/s eta 0:00:00
Collecting numpy
  Downloading numpy-2.0.1-cp311-cp311-win_amd64.whl (16.6 MB)
     --------------------------------------- 16.6/16.6 MB 11.3 MB/s eta 0:00:00
Collecting tensorflow
  Downloading tensorflow-2.17.0-cp311-cp311-win_amd64.whl (2.0 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl (11.0 MB)
     --------------------------------------- 11.0/11.0 MB 14.2 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
     -------------------------------------- 505.5/505.5 kB 5.3 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
     ------------------------------------- 345.4/345.4 kB 10.8 MB/s eta 0:00:00
Collecting tensorflow-intel==2.17.0
  Downloading tensorflow_intel-2.17.0-cp311-cp311-win_amd6


[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Data Preperation

In [7]:
# Load the dataset
data = pd.read_csv('../data/nhldraft.csv')

# Explore the data
print(data.head())
print(data.info())
print(data.describe())

# Handle missing values (if any)
data = data.dropna()

# Encode categorical variables
data = pd.get_dummies(data, columns=['team', 'nationality', 'position', 'amateur_team'])

# Split the data into features and target
features = data.drop(['player'], axis=1)  # Dropping the player column as it is not used for predictions
target = data['overall_pick']  # Assuming we want to predict the overall pick position

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Normalize/scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


   id  year  overall_pick                 team            player nationality  \
0   1  2022             1   Montreal Canadiens  Juraj Slafkovsky          SK   
1   2  2022             2    New Jersey Devils       Simon Nemec          SK   
2   3  2022             3      Arizona Coyotes      Logan Cooley          US   
3   4  2022             4       Seattle Kraken      Shane Wright          CA   
4   5  2022             5  Philadelphia Flyers   Cutter Gauthier          SE   

  position   age  to_year                           amateur_team  ...  points  \
0       LW  18.0      NaN                          TPS (Finland)  ...     NaN   
1        D  18.0      NaN                    HK Nitra (Slovakia)  ...     NaN   
2        C  18.0      NaN  USA U-18 Development Team (USDP/USHL)  ...     NaN   
3        C  18.0      NaN              Kingston Frontenacs (OHL)  ...     NaN   
4       LW  18.0      NaN  USA U-18 Development Team (USDP/USHL)  ...     NaN   

   plus_minus  penalties_minutes

Model Building

In [8]:
# Define the model architecture
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=[X_train.shape[1]]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 13096.4736 - mae: 96.3819 - val_loss: 12130.6113 - val_mae: 88.8310
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 12045.2734 - mae: 91.1363 - val_loss: 11583.9053 - val_mae: 85.8422
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 12234.1562 - mae: 90.3065 - val_loss: 10921.8945 - val_mae: 82.1884
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 11779.3760 - mae: 87.7689 - val_loss: 10147.1152 - val_mae: 77.9405
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 9735.6318 - mae: 76.7808 - val_loss: 9282.3467 - val_mae: 73.2105
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 7504.1597 - mae: 65.1183 - val_loss: 8311.0234 - val_mae: 68.4408
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━

Model Evalutation

In [9]:
# Evaluate the model on the testing data
loss, mae = model.evaluate(X_test, y_test)
print(f"Mean Absolute Error: {mae}")

# Make predictions
predictions = model.predict(X_test)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2962.4221 - mae: 42.1190 
Mean Absolute Error: 43.295406341552734
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step  


Save

In [10]:
# Save the trained model
model.save('draft_prediction_model.h5')

# Save predictions to a CSV file
predicted_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions.flatten()})
predicted_df.to_csv('predictions.csv', index=False)




extract data


In [None]:
%pip install pytesseract opencv-python pillow
import pytesseract
import cv2
from PIL import Image
import pandas as pd

# Load the image
image_path = '/mnt/data/image.png'
image = cv2.imread(image_path)

# Convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Use OCR to extract text
text = pytesseract.image_to_string(gray)

# Print extracted text
print(text)
