In [None]:
!pip install pandas scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Untouched copy of the raw dataset, read from the same relative path as the
# other loads in this notebook so it also works outside Colab's /content dir.
data = pd.read_csv("salarydata.csv")

In [None]:
# Read the salary CSV into `df` and preview its first rows
df = pd.read_csv('salarydata.csv')  # Use the actual uploaded filename
print("\nDataset Sample:", df.head(), sep="\n")

In [None]:
# Load your dataset
df = pd.read_csv('salarydata.csv')

# Display basic info
print("First 5 rows of data:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

print("\nMissing Values:")
print(df.isnull().sum())


In [None]:
# Summary statistics of `df` as computed by DataFrame.describe()
summary_stats = df.describe()
print("\nDescriptive Statistics:", summary_stats, sep="\n")

In [None]:
# One-hot encode any categorical columns (first level dropped per column)
df = pd.get_dummies(df, drop_first=True)

# Target is the Salary column; every remaining column is a feature
y = df['Salary']
X = df.drop(columns='Salary')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Salary is a continuous target, so the candidates must be regressors.
# Classifiers either reject a continuous target outright or treat every
# distinct salary as its own class, which makes accuracy meaningless.
# (The classifier imports above are kept only so existing names stay defined.)
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "SVR": SVR(),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# Maps model name -> R² on the held-out test set
results = {}

for name, model in models.items():
    # Scale features inside the pipeline so the scaler is fitted on the
    # training split only (matters for the distance/kernel based models).
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = r2
    print(f"{name} R²: {r2:.4f} | MAE: {mae:,.2f}")

In [None]:
# Fit an ordinary least-squares model on the training split
lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Evaluation: report each metric on the held-out test split
print("🔹 Linear Regression:")
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R² Score:", r2_score),
):
    print(label, metric(y_test, lr_pred))


In [None]:
# Fit a 100-tree random forest (fixed seed) and predict the test split
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluation: collect the three metrics, then print them in order
print("\n Random Forest Regressor:")
rf_scores = {
    "MSE:": mean_squared_error(y_test, y_pred),
    "MAE:": mean_absolute_error(y_test, y_pred),
    "R² Score:": r2_score(y_test, y_pred),
}
for label, value in rf_scores.items():
    print(label, value)

In [None]:
# Fit a k-nearest-neighbours regressor with default settings
kn_model = KNeighborsRegressor().fit(X_train, y_train)
y_pred = kn_model.predict(X_test)

# Evaluation on the held-out test split
print("\n KNeighborsRegressor :")
knn_metrics = [
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R² Score:", r2_score),
]
for label, metric in knn_metrics:
    print(label, metric(y_test, y_pred))

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
import joblib

# Define models.
# Salary is continuous, so the candidates are regressors compared by R²;
# classifiers scored by accuracy are not meaningful for this target.
# (The classifier imports above are kept only so existing names stay defined.)
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "SVR": SVR(),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# Maps model name -> R² on the held-out test set (higher is better)
results = {}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = r2_score(y_test, preds)
    results[name] = score
    print(f"{name}: {score:.4f}")

# Get best model (highest R²)
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\n✅ Best model: {best_model_name} with R² {results[best_model_name]:.4f}")

# Save the best model
joblib.dump(best_model, "best_model.pkl")
print("✅ Saved best model as best_model.pkl")


In [None]:
# Box plot of the Salary column from the `data` frame
fig, ax = plt.subplots()
ax.boxplot(data['Salary'])
plt.show()

In [None]:
import matplotlib.pyplot as plt

# One bar per entry of `results` (model name -> score)
model_names = list(results.keys())
model_scores = list(results.values())

plt.bar(model_names, model_scores, color='skyblue')
plt.title('Model Comparison')
plt.ylabel('Accuracy Score')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Recompute the Random Forest predictions from `rf_model` instead of reading
# the shared `y_pred` name: every model cell reassigns `y_pred`, so by the
# time this cell runs it can hold another model's predictions and the
# "Random Forest" line would be mislabeled.
rf_pred = rf_model.predict(X_test)

plt.figure(figsize=(10, 5))
plt.plot(list(y_test.values), label="Actual", marker='o')
plt.plot(list(lr_pred), label="Linear Regression", marker='s')
plt.plot(list(rf_pred), label="Random Forest", marker='x')
plt.title("Salary Prediction Comparison")
plt.xlabel("Test Data Index")
plt.ylabel("Salary")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Correlation heatmap of every column in `df`, annotated to two decimals
fig, ax = plt.subplots(figsize=(6, 4))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# STEP 6: Pairwise scatter/distribution plots for all columns of `df`
sns.pairplot(df)
# The pairplot's figure is the current figure, so title it through gcf()
plt.gcf().suptitle("Pairplot of Features", y=1.02)
plt.show()

# STEP 7: Two-column feature matrix and the Salary target
feature_columns = ['Annually Rating', 'Total Working Experience']
X = df[feature_columns]
y = df['Salary']





In [None]:
# STEP 10: Actual vs predicted salaries for a freshly fitted random forest
best_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

# Endpoints of the red dashed identity line (perfect predictions lie on it)
salary_range = [y.min(), y.max()]

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_best, color='green')
ax.plot(salary_range, salary_range, 'r--')
ax.set_xlabel("Actual Salary")
ax.set_ylabel("Predicted Salary")
ax.set_title("Actual vs Predicted Salary (Random Forest)")
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import joblib

# Load the dataset
df = pd.read_csv("salarydata.csv")
X = df[['Annually Rating', 'Total Working Experience']]
y = df['Salary']

# Train and save one model per option offered by the Streamlit app.
# The app loads salary_model.pkl, rf_model.pkl and knn_model.pkl, so all three
# files must exist and must be fitted on exactly these two feature columns.
# Separate names are used so `rf_model` / `kn_model` from earlier cells
# (fitted on the full feature set) are not overwritten.
app_models = {
    'salary_model.pkl': LinearRegression(),
    'rf_model.pkl': RandomForestRegressor(n_estimators=100, random_state=42),
    'knn_model.pkl': KNeighborsRegressor(),
}

for model_path, estimator in app_models.items():
    estimator.fit(X, y)
    joblib.dump(estimator, model_path)
    print(f"Saved {model_path}")

# Keep `model` bound to the fitted linear regression, as before
model = app_models['salary_model.pkl']


In [None]:
# Source of the Streamlit app, written to app.py at the end of this cell.
# The app expects salarydata.csv and the pickled models named in MODEL_FILES
# to be present in its working directory.
code = '''
import os

import streamlit as st
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Feature columns, in the order the prediction models expect them.
FEATURES = ["Annually Rating", "Total Working Experience"]

# Model choice shown in the UI -> pickle file that holds the fitted model.
MODEL_FILES = {
    "Linear Regression": "salary_model.pkl",
    "Random Forest": "rf_model.pkl",
    "KNN": "knn_model.pkl",
}


def load_model(path):
    # Show a readable error instead of a traceback when a model file is missing.
    if not os.path.exists(path):
        st.error(f"Model file '{path}' was not found. Train and save it before using the app.")
        st.stop()
    return joblib.load(path)


st.set_page_config(page_title="Salary Predictor", layout="centered")
menu = st.sidebar.radio("Navigation", ["Home", "EDA", "Batch Prediction", "About"])

if menu == "Home":
    st.title("💼 Salary Prediction App")
    st.subheader("Predict employee salary using ML models")

    model_option = st.selectbox("Select Model", ["Linear Regression", "Random Forest", "KNN"])
    model = load_model(MODEL_FILES[model_option])

    rating = st.slider("Annually Rating", 1, 5, 3)
    experience = st.number_input("Total Working Experience (in years)", min_value=0.0, step=0.1)

    if st.button("Predict Salary"):
        # Pass named columns so the input lines up with the training features.
        input_data = pd.DataFrame([[rating, experience]], columns=FEATURES)
        predicted_salary = model.predict(input_data)[0]
        st.success(f"Predicted Salary: ₹{int(predicted_salary):,}")

elif menu == "EDA":
    st.title("📊 Exploratory Data Analysis")
    df = pd.read_csv("salarydata.csv")
    st.write("**Dataset Summary**")
    st.dataframe(df.describe())

    st.write("**Correlation Heatmap**")
    fig, ax = plt.subplots()
    # numeric_only avoids an error when the raw CSV has text columns.
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="Blues", ax=ax)
    st.pyplot(fig)

elif menu == "Batch Prediction":
    st.title("📁 Batch Salary Prediction")
    uploaded_file = st.file_uploader("Upload CSV with 'Annually Rating' and 'Total Working Experience'", type="csv")
    if uploaded_file:
        batch_data = pd.read_csv(uploaded_file)
        missing = [col for col in FEATURES if col not in batch_data.columns]
        if missing:
            st.error(f"Uploaded CSV is missing required column(s): {', '.join(missing)}")
        else:
            model = load_model("salary_model.pkl")
            # Predict from the feature columns only, so extra columns in the
            # upload (names, ids, an existing salary) do not break the model.
            batch_data["Predicted Salary"] = model.predict(batch_data[FEATURES])
            st.dataframe(batch_data)
            csv = batch_data.to_csv(index=False).encode('utf-8')
            st.download_button("Download Predictions as CSV", csv, "predicted_salaries.csv", "text/csv")

elif menu == "About":
    st.title("📘 About This App")
    st.markdown(\"\"\"
    This is a Machine Learning-based web application that predicts salaries based on:
    - Annual performance rating
    - Total work experience

    Built using:
    - Streamlit
    - Scikit-learn
    - Google Colab
    - pyngrok

    Developed by: [Your Name]
    \"\"\")
'''

# Save to app.py. The source contains emoji and the rupee sign, so write it as
# UTF-8 explicitly rather than relying on the platform's default encoding.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(code)


In [None]:
!pip install -q streamlit
!pip install -q pyngrok


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Kill previous tunnels
ngrok.kill()

# Setup your ngrok auth token (sign up at https://dashboard.ngrok.com to get one).
# The token is a credential: never paste it into the notebook. It is read from
# the NGROK_AUTHTOKEN environment variable, falling back to a hidden prompt.
# Any token that was ever committed in a notebook should be revoked in the
# ngrok dashboard and replaced.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

In [None]:
# Run streamlit in background
# (`&>` redirects both stdout and stderr to /content/logs.txt; the trailing `&`
# backgrounds the process so the cell returns immediately.)
!streamlit run app.py &>/content/logs.txt &

# Create ngrok tunnel (use correct protocol)
# NOTE(review): assumes Streamlit is serving on port 8501 — no port is set on
# the command above, so confirm this matches the server's actual port.
from pyngrok import ngrok
public_url = ngrok.connect("http://localhost:8501")
# Prints the object returned by ngrok.connect(), not a plain URL string.
print("Streamlit app is live at:", public_url)

