In [1]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.37.1-py2.py3-none-any.whl (8.7 MB)
[2K   

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from pandas.errors import ParserError
import pickle

# Load the merged and labeled data, trying different delimiters
try:
    data = pd.read_csv('aditya.csv', sep=',')  # Try default comma first
except ParserError:
    try:
        data = pd.read_csv('aditya.csv', sep=';')  # Try semicolon
    except ParserError:
        data = pd.read_csv('aditya.csv', sep='\t')  # Try tab

# Print the column names to verify if 'domain' exists
print(data.columns)


Index(['domain', 'label'], dtype='object')


In [3]:
# Separate features (domain names) and labels
X = data['domain'].values  # Replace 'domain' with the correct column name if needed
y = data['label'].values

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to have the same length
maxlen = max([len(seq) for seq in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=maxlen)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=maxlen))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.2)




[1m6731/6731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1312s[0m 194ms/step - accuracy: 0.9520 - loss: 0.1116 - val_accuracy: 0.9986 - val_loss: 0.0051


<keras.src.callbacks.history.History at 0x7e44e92e6350>

In [5]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Save the model and tokenizer
model.save('model.h5')

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

[1m2104/2104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 45ms/step - accuracy: 0.9986 - loss: 0.0054




Test Accuracy: 1.00


In [None]:
%%writefile app.py
import streamlit as st
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the model and tokenizer
model = load_model('model.h5')

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Define the maxlen (the length you used during training)
maxlen = 50  # Replace this with the actual maxlen used in your training

# Streamlit app setup
st.title("Domain Legitimacy Prediction")
st.write("Enter a domain name to check if it's legitimate or non-legitimate.")

# Get user input
domain_input = st.text_input("Domain name:")

if st.button("Predict"):
    if domain_input:
        # Preprocess the input
        new_sequences = tokenizer.texts_to_sequences([domain_input])
        new_padded = pad_sequences(new_sequences, maxlen=maxlen, padding='post')

        # Predict
        prediction = model.predict(new_padded)
        result = 'Non-Legitimate' if prediction > 0.5 else 'Legitimate'

        # Show result
        st.write(f"The domain '{domain_input}' is predicted to be **{result}**.")
    else:
        st.write("Please enter a domain name.")


In [None]:
from pyngrok import ngrok

# Start the Streamlit app
!streamlit run app.py &

# Create a tunnel to the Streamlit app
public_url = ngrok.connect(port='8501')
print(f'Public URL: {public_url}')



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.138.167.154:8501[0m
[0m
