<a href="https://colab.research.google.com/github/vijays4404/TimeSeries/blob/main/NumberPrediction5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
data = []

# Iterate over years from 2024 to 2010
for year in range(2024, 2010, -1):
    # Construct URL for the year
    year_url = f"https://www.nylottery.org/take-5/past-winning-numbers/{year}"

    # Send a GET request to the URL
    response = requests.get(year_url)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table element
    table = soup.find('table')

    # Iterate through each row in the table
    for row in table.find_all('tr'):
        # Extract data from each cell in the row
        cells = row.find_all('td')
        if len(cells) > 0:
            date = cells[0].text.strip()
            midday_numbers = [number.text for number in cells[1].find_all('span')[0:5]]
            evening_numbers = [number.text for number in cells[1].find_all('span')[5:10]]
            midday_prize = cells[2].find('strong').text.strip()
            # Check if there is a second strong tag before accessing it
            evening_prize = cells[2].find_all('strong')[1].text.strip() if len(cells[2].find_all('strong')) > 1 else None
            data.append([date, midday_numbers, evening_numbers, midday_prize, evening_prize])

# Create a DataFrame
df = pd.DataFrame(data, columns=['Date', 'Midday Numbers', 'Evening Numbers', 'Midday Prize', 'Evening Prize'])




In [2]:
df=df.drop(df.columns[2:],axis=1)

In [3]:
df.columns

Index(['Date', 'Midday Numbers'], dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4811 entries, 0 to 4810
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            4811 non-null   object
 1   Midday Numbers  4811 non-null   object
dtypes: object(2)
memory usage: 75.3+ KB


In [5]:
df['Date']=pd.to_datetime(df['Date'])

In [6]:
# Split the lists in 'Midday Numbers' column into separate columns
df[['Num1', 'Num2', 'Num3', 'Num4', 'Num5']] = pd.DataFrame(df['Midday Numbers'].tolist(), index=df.index)

# Drop the original 'Midday Numbers' column
df.drop('Midday Numbers', axis=1, inplace=True)



In [7]:
df

Unnamed: 0,Date,Num1,Num2,Num3,Num4,Num5
0,2024-03-15,9,10,12,26,38
1,2024-03-14,2,5,6,15,39
2,2024-03-13,2,4,8,10,20
3,2024-03-12,11,14,19,31,37
4,2024-03-11,7,14,15,21,25
...,...,...,...,...,...,...
4806,2011-01-05,2,11,15,21,39
4807,2011-01-04,4,5,12,26,36
4808,2011-01-03,14,18,19,21,38
4809,2011-01-02,4,5,7,14,32


In [8]:
# Save data as lagged data with one time step
lag_df = pd.concat([df, df.shift(1).add_prefix('lag_')], axis=1)
lag_df_prediction=lag_df.iloc[0]
lag_df=lag_df.dropna()




In [9]:
lag_prediction=lag_df_prediction[0:6]


In [10]:
lag_prediction=pd.DataFrame(lag_prediction)
lag_prediction=lag_prediction.transpose()
lag_prediction
lag_prediction['Month'] = lag_prediction['Date'].dt.month
lag_prediction['Day'] = lag_prediction['Date'].dt.day
lag_prediction['Year'] = lag_prediction['Date'].dt.year
lag_prediction=lag_prediction.drop('Date',axis=1)
lag_prediction

Unnamed: 0,Num1,Num2,Num3,Num4,Num5,Month,Day,Year
0,9,10,12,26,38,3,15,2024


In [11]:
lag_df

Unnamed: 0,Date,Num1,Num2,Num3,Num4,Num5,lag_Date,lag_Num1,lag_Num2,lag_Num3,lag_Num4,lag_Num5
1,2024-03-14,2,5,6,15,39,2024-03-15,9,10,12,26,38
2,2024-03-13,2,4,8,10,20,2024-03-14,2,5,6,15,39
3,2024-03-12,11,14,19,31,37,2024-03-13,2,4,8,10,20
4,2024-03-11,7,14,15,21,25,2024-03-12,11,14,19,31,37
5,2024-03-10,7,14,15,28,29,2024-03-11,7,14,15,21,25
...,...,...,...,...,...,...,...,...,...,...,...,...
4806,2011-01-05,2,11,15,21,39,2011-01-06,20,21,28,32,34
4807,2011-01-04,4,5,12,26,36,2011-01-05,2,11,15,21,39
4808,2011-01-03,14,18,19,21,38,2011-01-04,4,5,12,26,36
4809,2011-01-02,4,5,7,14,32,2011-01-03,14,18,19,21,38


In [12]:
lag_df.columns

Index(['Date', 'Num1', 'Num2', 'Num3', 'Num4', 'Num5', 'lag_Date', 'lag_Num1',
       'lag_Num2', 'lag_Num3', 'lag_Num4', 'lag_Num5'],
      dtype='object')

In [13]:
df=lag_df.drop('lag_Date',axis=1)

In [14]:
df

Unnamed: 0,Date,Num1,Num2,Num3,Num4,Num5,lag_Num1,lag_Num2,lag_Num3,lag_Num4,lag_Num5
1,2024-03-14,2,5,6,15,39,9,10,12,26,38
2,2024-03-13,2,4,8,10,20,2,5,6,15,39
3,2024-03-12,11,14,19,31,37,2,4,8,10,20
4,2024-03-11,7,14,15,21,25,11,14,19,31,37
5,2024-03-10,7,14,15,28,29,7,14,15,21,25
...,...,...,...,...,...,...,...,...,...,...,...
4806,2011-01-05,2,11,15,21,39,20,21,28,32,34
4807,2011-01-04,4,5,12,26,36,2,11,15,21,39
4808,2011-01-03,14,18,19,21,38,4,5,12,26,36
4809,2011-01-02,4,5,7,14,32,14,18,19,21,38


In [15]:
prediction=df[df.columns[6:]]
train_df=df[df.columns[0:6]]


In [16]:


# Assuming 'Date' column is already in datetime format
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Year'] = train_df['Date'].dt.year


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Month'] = train_df['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Day'] = train_df['Date'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Year'] = train_df['Date'].dt.year


In [17]:

train_df=train_df.drop('Date',axis=1)


In [18]:
scaler=StandardScaler()
scaler.fit(train_df)
train_df = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns, index=train_df.index)

In [19]:
lag_prediction

lag_prediction= pd.DataFrame(scaler.transform(lag_prediction), columns=lag_prediction.columns, index=lag_prediction.index)
print(lag_prediction)

       Num1     Num2      Num3      Num4      Num5     Month       Day  \
0  0.448123 -0.49054 -1.126343 -0.077357  0.908628 -0.992373 -0.079665   

       Year  
0  1.807555  


In [20]:
train_df

Unnamed: 0,Num1,Num2,Num3,Num4,Num5,Month,Day,Year
1,-0.911428,-1.247664,-1.988509,-1.743863,1.095529,-0.992373,-0.193269,1.807555
2,-0.911428,-1.399088,-1.701120,-2.501366,-2.455595,-0.992373,-0.306874,1.807555
3,0.836566,0.115158,-0.120482,0.680146,0.721726,-0.992373,-0.420478,1.807555
4,0.059680,0.115158,-0.695260,-0.834859,-1.521089,-0.992373,-0.534083,1.807555
5,0.059680,0.115158,-0.695260,0.225644,-0.773484,-0.992373,-0.647687,1.807555
...,...,...,...,...,...,...,...,...
4806,-0.911428,-0.339116,-0.695260,-0.834859,1.095529,-1.567852,-1.215709,-1.607863
4807,-0.522985,-1.247664,-1.126343,-0.077357,0.534825,-1.567852,-1.329314,-1.607863
4808,1.419230,0.720857,-0.120482,-0.834859,0.908628,-1.567852,-1.442918,-1.607863
4809,-0.522985,-1.247664,-1.844815,-1.895363,-0.212780,-1.567852,-1.556523,-1.607863


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Assuming you have a variable name prefix and looping variable i
prefix = "Number"
models=[]
# Assuming you have your features X and target variable y prepared
for i in range(0,len(prediction.columns)):
# Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(train_df, prediction[prediction.columns[i]], test_size=0.2, random_state=42)

  # Instantiate the Random Forest Regressor model
  rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

  # Train the model on the training data
  rf_regressor.fit(X_train, y_train)
  variable_name = "{}{}".format(prefix, i)
  # Make predictions on the testing data
  y_pred = rf_regressor.predict(X_test)
  print(y_test.iloc[0])
  print(variable_name[0])
  # Evaluate the model using Mean Squared Error
  mse = mean_squared_error(y_test, y_pred)
  print("Mean Squared Error:", mse)


4
N
Mean Squared Error: 26.46570395010395
6
N
Mean Squared Error: 46.00411964656965
7
N
Mean Squared Error: 53.364386382536374
19
N
Mean Squared Error: 47.116352390852384
28
N
Mean Squared Error: 30.848038461538465


In [22]:
y_test

1449    28
2935    36
795     35
1030    36
9       38
        ..
1021    36
2806    36
199     33
4013    16
1489    28
Name: lag_Num5, Length: 962, dtype: object

In [23]:
y_test.iloc[0]

'28'

In [24]:
lag_df.iloc[0]

Date        2024-03-14 00:00:00
Num1                          2
Num2                          5
Num3                          6
Num4                         15
Num5                         39
lag_Date    2024-03-15 00:00:00
lag_Num1                      9
lag_Num2                     10
lag_Num3                     12
lag_Num4                     26
lag_Num5                     38
Name: 1, dtype: object

In [25]:
X_train,x_test,y_train,y_test=train_test_split(train_df,prediction[prediction.columns[0]],train_size=0.7,random_state=123)

In [26]:
model_number1=RandomForestRegressor()
model_number1.fit(X_train,y_train)
print(model_number1.score(x_test,y_test))

-0.03393721357154211


In [27]:
X_train,x_test,y_train,y_test=train_test_split(train_df,prediction[prediction.columns[1]],train_size=0.7,random_state=123)
model_number2=RandomForestRegressor()
model_number2.fit(X_train,y_train)
print(model_number2.score(x_test,y_test))

-0.04549246294727771


In [28]:
X_train,x_test,y_train,y_test=train_test_split(train_df,prediction[prediction.columns[2]],train_size=0.7,random_state=123)
model_number3=RandomForestRegressor()
model_number3.fit(X_train,y_train)
print(model_number3.score(x_test,y_test))

-0.07925761279012855


In [29]:
X_train,x_test,y_train,y_test=train_test_split(train_df,prediction[prediction.columns[3]],train_size=0.7,random_state=123)
model_number4=RandomForestRegressor()
model_number4.fit(X_train,y_train)
print(model_number4.score(x_test,y_test))

-0.08891747267279326


In [30]:
X_train,x_test,y_train,y_test=train_test_split(train_df,prediction[prediction.columns[4]],train_size=0.7,random_state=123)
model_number5=RandomForestRegressor()
model_number5.fit(X_train,y_train)
print(model_number5.score(x_test,y_test))

-0.08851141752939351


In [31]:
model_number1.predict(lag_prediction)

array([9.2])

In [32]:
model_number2.predict(lag_prediction)

array([16.32])

In [33]:
model_number3.predict(lag_prediction)

array([21.8])

In [34]:
model_number4.predict(lag_prediction)

array([26.03])

In [36]:
model_number5.predict(lag_prediction)

array([31.79])

In [37]:
[7,13,19,27,31]

[7, 13, 19, 27, 31]