In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Data Preparation

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,target
0,-70,-61,-66,-53,-51,-63,-82,-57,-76,-78,-66,-66,-61,-59,-73,-75,-63,-77,B37
1,-77,-74,-71,-76,-65,-63,-66,-52,-55,-75,-72,-75,-74,-61,-64,-63,-53,-63,B61
2,-53,-38,-55,-66,-62,-62,-65,-70,-62,-52,-56,-53,-66,-68,-72,-60,-68,-77,A19
3,-72,-62,-59,-65,-65,-65,-78,-82,-83,-59,-84,-60,-64,-83,-69,-72,-95,-73,A22
4,-67,-69,-65,-63,-59,-53,-70,-72,-71,-60,-61,-57,-54,-76,-61,-66,-71,-80,A33


In [4]:
test_df.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18
0,-76,-83,-70,-66,-64,-72,-64,-69,-60,-76,-83,-78,-81,-81,-81,-70,-60,-60
1,-58,-57,-78,-81,-73,-73,-78,-78,-82,-49,-55,-58,-66,-79,-72,-83,-74,-80
2,-70,-70,-71,-69,-69,-68,-61,-55,-53,-82,-87,-76,-68,-57,-64,-75,-57,-70
3,-71,-61,-56,-56,-61,-60,-68,-66,-72,-58,-55,-56,-58,-62,-61,-59,-64,-65
4,-72,-71,-64,-69,-64,-63,-61,-42,-55,-61,-69,-67,-63,-63,-55,-49,-49,-57


In [5]:
train_df.isnull().sum()

T1        0
T2        0
T3        0
T4        0
T5        0
T6        0
T7        0
T8        0
T9        0
T10       0
T11       0
T12       0
T13       0
T14       0
T15       0
T16       0
T17       0
T18       0
target    0
dtype: int64

In [6]:
test_df.isnull().sum()

T1     0
T2     0
T3     0
T4     0
T5     0
T6     0
T7     0
T8     0
T9     0
T10    0
T11    0
T12    0
T13    0
T14    0
T15    0
T16    0
T17    0
T18    0
dtype: int64

In [7]:
x_train = train_df.drop(columns=['target'])
y_train = train_df['target']

In [8]:
x_test = test_df

In [9]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [10]:
x_train_scaled

array([[-0.47321406,  0.39508167, -0.17481688, ..., -1.03062689,
         0.28899656, -0.98195758],
       [-1.27438905, -1.06322685, -0.78798229, ...,  0.10784997,
         1.28186418,  0.33869147],
       [ 1.47249662,  2.97516597,  1.17414703, ...,  0.39246918,
        -0.20743726, -0.98195758],
       ...,
       [ 2.73148589,  0.7316144 , -0.91061537, ..., -0.55626153,
        -0.90244459, -0.69896135],
       [ 1.81585733,  1.62903503, -0.6653492 , ..., -0.93575382,
        -0.60458431, -1.35928588],
       [-0.01539979,  0.05854893, -0.6653492 , ...,  0.29759611,
        -0.10815049,  0.15002732]])

In [11]:
x_test_scaled

array([[-1.15680286, -2.05114505, -0.67103324, ..., -0.54792197,
         0.59284254,  0.63077694],
       [ 0.89896217,  0.83256454, -1.64957776, ..., -1.78823581,
        -0.80716581, -1.279845  ],
       [-0.47154785, -0.60929025, -0.79335131, ..., -1.02496576,
         0.89284433, -0.32453403],
       ...,
       [-1.15680286, -0.27655453, -0.54871518, ..., -0.16628695,
         1.09284552,  1.20396352],
       [-0.35733868, -1.49658551, -0.05944291, ...,  0.02453057,
        -0.00716104, -0.22900293],
       [-0.92838452, -0.38746644, -0.67103324, ...,  0.69239186,
         1.39284731,  1.01290133]])

# Model Selection and Training and Saving them as Pickle Files

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [17]:
# Initialize Models
models = {
    'Logistic Regression' : LogisticRegression(max_iter = 1000),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(n_estimators = 100),
    'SVM' : SVC()
}

In [16]:
# Dictionary to store training accuracies
train_accuracies = {}

In [25]:
import joblib

In [26]:
for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_train_pred = model.predict(x_train_scaled)
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracies[name] = accuracy
    print(f'{name} Training Accuracy : {accuracy}')
    model_filename = f'{name.replace(" ", "_").lower()}_model.pkl'
    joblib.dump(model, model_filename)
    print(f'Model saved as {model_filename}')

Logistic Regression Training Accuracy : 0.9770352633870265
Model saved as logistic_regression_model.pkl
Decision Tree Training Accuracy : 0.9994286025250326
Model saved as decision_tree_model.pkl
Random Forest Training Accuracy : 0.9994286025250326
Model saved as random_forest_model.pkl
SVM Training Accuracy : 0.9901229865041359
Model saved as svm_model.pkl


# Model Testing and Prediction

In [27]:
test_predictions = {}

for name, model in models.items():
    y_test_pred = model.predict(x_test_scaled)
    test_predictions[name] = y_test_pred
    
    pd.DataFrame(y_test_pred, columns=['target']).to_csv(f'test_predictions_{name.replace(" ", "_")}.csv', index=False)

# Report Accuracy and Choose the Best Model

In [28]:
# Report training accuracies
print("\nTraining Accuracies:")
for name, accuracy in train_accuracies.items():
    print(f'{name}: {accuracy}')

# Justification for model choice
# The model with the highest training accuracy will be our primary choice.
best_model_name = max(train_accuracies, key=train_accuracies.get)
best_model_accuracy = train_accuracies[best_model_name]
print(f'\nBest Model: {best_model_name} with Training Accuracy: {best_model_accuracy}')

# Assuming we need to provide the prediction file for the best model
best_model_predictions = test_predictions[best_model_name]
pd.DataFrame(best_model_predictions, columns=['target']).to_csv('best_model_test_predictions.csv', index=False)


Training Accuracies:
Logistic Regression: 0.9770352633870265
Decision Tree: 0.9994286025250326
Random Forest: 0.9994286025250326
SVM: 0.9901229865041359

Best Model: Decision Tree with Training Accuracy: 0.9994286025250326


# Streamlit App

In [1]:
import streamlit as st
import pandas as pd
import joblib

# Load the test data
test_data = pd.read_csv(r"D:\Internship tasks\Task1and2\test.csv")

# Load the trained models
model_files = {
    'Logistic Regression': r'D:\Internship tasks\Task1and2\logistic_regression_model.pkl',
    'Decision Tree': r'D:\Internship tasks\Task1and2\decision_tree_model.pkl',
    'Random Forest': r'D:\Internship tasks\Task1and2\random_forest_model.pkl',
    'SVM': r'D:\Internship tasks\Task1and2\svm_model.pkl'
}

# Function to make predictions
def predict(model_name, data):
    model = joblib.load(model_files[model_name])
    predictions = model.predict(data)
    return predictions

# Streamlit UI
st.title('Model Prediction')

# Display the test data
st.write('Test Data:')
st.write(test_data)

# Input box for user to enter data
user_input = st.text_input('Enter your data separated by spaces:')
if user_input:
    user_data = [float(val) for val in user_input.split()]
    user_data = [user_data]  # Convert to 2D array
    selected_model = st.selectbox('Select a model:', list(model_files.keys()))
    if st.button('Predict'):
        predictions = predict(selected_model, user_data)
        st.write('Predictions:')
        st.write(predictions)

2024-05-25 21:04:13.382 
  command:

    streamlit run C:\Users\saran\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-05-25 21:04:13.393 Session state does not function when running a script without `streamlit run`
