In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import spearmanr

def check_linear_vs_nonlinear(df, target_col, top_n=5):
    results = []
    X = df.drop(columns=[target_col])
    y = df[target_col]

    for col in X.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            feature = df[[col]].values

            # Fit simple linear regression
            lr = LinearRegression().fit(feature, y)
            y_pred = lr.predict(feature)
            r2_linear = r2_score(y, y_pred)   # how well linear fits

            # Spearman correlation (monotonic check, captures non-linear monotonic)
            rho, _ = spearmanr(df[col], y)

            results.append({
                "feature": col,
                "linear_r2": round(r2_linear, 3),
                "spearman_rho": round(rho, 3),
                "likely_relation": "Linear"
                                  if r2_linear > 0.6 and abs(rho - r2_linear) < 0.1
                                  else "Non-Linear"
            })

    results_df = pd.DataFrame(results).sort_values(by="linear_r2", ascending=False)
    return results_df.head(top_n)


In [17]:
# Example usage
df = pd.read_csv("Placement.csv")
df = pd.get_dummies(df, drop_first=True) 
df

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_M,ssc_b_Others,hsc_b_Others,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,workex_Yes,specialisation_Mkt&HR,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,True,True,True,True,False,False,True,False,True,True
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,True,False,True,False,True,False,True,True,False,True
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,True,False,False,False,False,False,False,False,False,True
3,4,56.00,52.00,52.00,66.0,59.43,,True,False,False,False,True,False,True,False,True,False
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,True,True,True,True,False,False,False,False,False,True
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,True,True,True,False,True,False,True,False,False,True
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,True,True,True,True,False,False,False,True,False,True
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,False,True,True,True,False,False,False,False,True,True


In [18]:
df.fillna(0, inplace=True)
df

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_M,ssc_b_Others,hsc_b_Others,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,workex_Yes,specialisation_Mkt&HR,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,True,True,True,True,False,False,True,False,True,True
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,True,False,True,False,True,False,True,True,False,True
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,True,False,False,False,False,False,False,False,False,True
3,4,56.00,52.00,52.00,66.0,59.43,0.0,True,False,False,False,True,False,True,False,True,False
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,True,True,True,True,False,False,False,False,False,True
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,True,True,True,False,True,False,True,False,False,True
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,True,True,True,True,False,False,False,True,False,True
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,False,True,True,True,False,False,False,False,True,True


In [21]:


# Suppose target column is classification_yes
results = check_linear_vs_nonlinear(df, target_col="salary", top_n=15)

print("Feature relationship check:")
print(results)

Feature relationship check:
                  feature  linear_r2  spearman_rho likely_relation
15          status_Placed      0.750         0.816          Linear
1                   ssc_p      0.290         0.542      Non-Linear
2                   hsc_p      0.205         0.414      Non-Linear
3                degree_p      0.167         0.398      Non-Linear
13             workex_Yes      0.089         0.269      Non-Linear
14  specialisation_Mkt&HR      0.076        -0.258      Non-Linear
4                 etest_p      0.035         0.203      Non-Linear
6                gender_M      0.020         0.190      Non-Linear
5                   mba_p      0.020         0.125      Non-Linear
11        degree_t_Others      0.011        -0.089      Non-Linear
12      degree_t_Sci&Tech      0.006         0.103      Non-Linear
7            ssc_b_Others      0.001         0.057      Non-Linear
10          hsc_s_Science      0.001         0.030      Non-Linear
0                   sl_no      0.0

In [2]:
# Example usage
df = pd.read_csv("prep.csv")
df = pd.get_dummies(df, drop_first=True)  # encode categoricals

# Suppose target column is classification_yes
results = check_linear_vs_nonlinear(df, target_col="classification_yes", top_n=10)

print("Feature relationship check:")
print(results)


Feature relationship check:
      feature  linear_r2  spearman_rho likely_relation
9        hrmo      0.538        -0.787      Non-Linear
10        pcv      0.479        -0.774      Non-Linear
12         rc      0.352        -0.633      Non-Linear
21    htn_yes      0.348         0.590      Non-Linear
22     dm_yes      0.312         0.558      Non-Linear
2          al      0.283         0.593      Non-Linear
13       sg_b      0.259        -0.509      Non-Linear
4         bgr      0.162         0.460      Non-Linear
14       sg_c      0.161         0.401      Non-Linear
24  appet_yes      0.148        -0.385      Non-Linear


In [4]:
# Example usage
df = pd.read_csv("Placement.csv")
df = pd.get_dummies(df, drop_first=True) 
df

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_M,ssc_b_Others,hsc_b_Others,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,workex_Yes,specialisation_Mkt&HR,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,True,True,True,True,False,False,True,False,True,True
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,True,False,True,False,True,False,True,True,False,True
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,True,False,False,False,False,False,False,False,False,True
3,4,56.00,52.00,52.00,66.0,59.43,,True,False,False,False,True,False,True,False,True,False
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,True,True,True,True,False,False,False,False,False,True
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,True,True,True,False,True,False,True,False,False,True
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,True,True,True,True,False,False,False,True,False,True
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,False,True,True,True,False,False,False,False,True,True


In [10]:
# Example usage
df = pd.read_csv("Placement.csv")
df = pd.get_dummies(df, drop_first=True)  # encode categoricals

# Suppose target column is classification_yes
results = check_linear_vs_nonlinear(df, target_col="status_Placed", top_n=10)

print("Feature relationship check:")
print(results)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [7]:
df.isnull().sum()

sl_no                     0
ssc_p                     0
hsc_p                     0
degree_p                  0
etest_p                   0
mba_p                     0
salary                   67
gender_M                  0
ssc_b_Others              0
hsc_b_Others              0
hsc_s_Commerce            0
hsc_s_Science             0
degree_t_Others           0
degree_t_Sci&Tech         0
workex_Yes                0
specialisation_Mkt&HR     0
status_Placed             0
dtype: int64

In [8]:
df["salary"] = df["salary"].fillna(0)


In [9]:
df.isnull().sum()

sl_no                    0
ssc_p                    0
hsc_p                    0
degree_p                 0
etest_p                  0
mba_p                    0
salary                   0
gender_M                 0
ssc_b_Others             0
hsc_b_Others             0
hsc_s_Commerce           0
hsc_s_Science            0
degree_t_Others          0
degree_t_Sci&Tech        0
workex_Yes               0
specialisation_Mkt&HR    0
status_Placed            0
dtype: int64

In [11]:
# Example usage
df = pd.read_csv("Placement.csv")
df = pd.get_dummies(df, drop_first=True)  # encode categoricals

# Suppose target column is classification_yes
results = check_linear_vs_nonlinear(df, target_col="status_Placed", top_n=10)

print("Feature relationship check:")
print(results)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [12]:
df

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_M,ssc_b_Others,hsc_b_Others,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,workex_Yes,specialisation_Mkt&HR,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,True,True,True,True,False,False,True,False,True,True
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,True,False,True,False,True,False,True,True,False,True
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,True,False,False,False,False,False,False,False,False,True
3,4,56.00,52.00,52.00,66.0,59.43,,True,False,False,False,True,False,True,False,True,False
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,True,True,True,True,False,False,False,False,False,True
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,True,True,True,False,True,False,True,False,False,True
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,True,True,True,True,False,False,False,True,False,True
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,False,True,True,True,False,False,False,False,True,True


In [13]:

df.fillna(0, inplace=True)


In [14]:
df

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_M,ssc_b_Others,hsc_b_Others,hsc_s_Commerce,hsc_s_Science,degree_t_Others,degree_t_Sci&Tech,workex_Yes,specialisation_Mkt&HR,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,True,True,True,True,False,False,True,False,True,True
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,True,False,True,False,True,False,True,True,False,True
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,True,False,False,False,False,False,False,False,False,True
3,4,56.00,52.00,52.00,66.0,59.43,0.0,True,False,False,False,True,False,True,False,True,False
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,True,True,True,True,False,False,False,False,False,True
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,True,True,True,False,True,False,True,False,False,True
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,True,True,True,True,False,False,False,True,False,True
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,False,True,True,True,False,False,False,False,True,True


In [15]:
# Example usage
df = pd.read_csv("Placement.csv")
df = pd.get_dummies(df, drop_first=True)  # encode categoricals

# Suppose target column is classification_yes
results = check_linear_vs_nonlinear(df, target_col="status_Placed", top_n=10)

print("Feature relationship check:")
print(results)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values