<a href="https://colab.research.google.com/github/vidhan-tiwari/Fake_instagram_account_detection/blob/main/InstaFakeID_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate this session with the Hugging Face Hub before the dataset is downloaded.
# NOTE(review): presumably only needed if the dataset is gated or private — confirm,
# since an interactive prompt gets in the way of an unattended "Run All".
from huggingface_hub import login
login()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Fetch the Instagram bot-detection dataset from the Hugging Face Hub.
from datasets import load_dataset
dataset = load_dataset("nahiar/instagram_bot_detection")

In [None]:
# Work with the 'train' split as a pandas DataFrame and preview its first rows.
df = dataset['train'].to_pandas()
df.head()

In [None]:
# (rows, columns) of the full frame before splitting.
df.shape

In [None]:
# Shuffle every row once (seeded, so the split is reproducible), then cut the
# shuffled frame at the 80% mark: first part trains, the remainder tests.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

splitting_point = int(0.8 * len(df))
train_df, test_df = (
    df_shuffled.iloc[:splitting_point],
    df_shuffled.iloc[splitting_point:],
)

In [None]:
# Re-shuffle each split and give it a fresh 0..n-1 index.
# The splits are rebuilt from df_shuffled instead of from train_df/test_df
# themselves, so re-running this cell always yields the same row order
# (the original reshuffled its own previous output on every run).
train_df = (
    df_shuffled.iloc[:splitting_point]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
test_df = (
    df_shuffled.iloc[splitting_point:]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the training split.
train_df.head()


In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Row counts of the two splits (80% / 20% of the shuffled frame).
len(train_df),len(test_df)

Here "nums/length username" and "nums/length fullname" are the ratios of numeric characters to total length in the account's username and full name, respectively.

In [None]:
# Separate the 0/1 flag columns (including the 'fake' target) from the
# remaining columns. Only the non-binary columns are later inspected for
# skewness and power-transformed; nothing is plotted in this cell.
binary_columns = ["profile pic","name==username","external URL","private","fake"]
non_binary_columns = [col for col in train_df.columns if col not in binary_columns]

In [None]:
# Number of non-binary feature columns.
len(non_binary_columns)

In [None]:
# Names of the non-binary feature columns.
non_binary_columns

In [None]:
# Skewness of each non-binary column in the untransformed training split.
train_df[non_binary_columns].skew()

In [None]:
# Candidate feature subset and the columns considered for removal.
# NOTE(review): in the visible notebook these lists only feed the skewness
# inspection that follows; `columns_to_drop` is not referenced anywhere else
# and the models are fitted on every feature column — confirm this is intended.
selected_columns = ['profile pic','nums/length username','fullname words','nums/length fullname','description length','external URL','#posts','#follows']
columns_to_drop = ['name==username','private','#followers']

In [None]:
# Selected columns that are also non-binary, kept in `selected_columns` order.
selected_non_binary_columns = [col for col in selected_columns if col in non_binary_columns]
selected_non_binary_columns

In [None]:
# Report skewness and the observed min-max range of each selected column.
for col in selected_non_binary_columns:
  column_values = train_df[col]
  print(f"{col} : skewness = {column_values.skew()} , range = {column_values.min()} - {column_values.max()}")

# Applying transformation techniques first

In [None]:
from sklearn.preprocessing import PowerTransformer
def transform(df, pt_train_data = None):
  """Power-transform the non-binary feature columns of ``df`` (Yeo-Johnson).

  Args:
    df: DataFrame containing every column listed in the module-level
      ``non_binary_columns``. It is not modified; a copy is returned.
    pt_train_data: An already fitted transformer to reuse. When ``None`` a new
      ``PowerTransformer`` is fitted on ``df`` itself.

  Returns:
    A ``(transformed_df, transformer)`` tuple, where ``transformer`` is the
    object that was applied (newly fitted, or the one passed in).
  """
  # Work on a copy so the column assignment never touches the caller's frame.
  transformed = df.copy()

  if pt_train_data is not None:
    # Reuse the fitted transformer: apply only, never refit (e.g. on test data).
    pt = pt_train_data
    transformed[non_binary_columns] = pt.transform(transformed[non_binary_columns])
    return transformed, pt

  pt = PowerTransformer(method='yeo-johnson')
  transformed[non_binary_columns] = pt.fit_transform(transformed[non_binary_columns])
  return transformed, pt

In [None]:
# Fit the power transformer on the training split and keep it for reuse.
new_train_df ,pt_train_data= transform(train_df)

In [None]:
# Skewness of the non-binary columns after the power transform (training split).
new_train_df[non_binary_columns].skew()

In [None]:
# Apply the transformer fitted on the training split to the test split (no refit).
new_test_df,pt_train_data  = transform(test_df,pt_train_data)

In [None]:
# Skewness of the non-binary columns after the power transform (test split).
new_test_df[non_binary_columns].skew()

# DATA SCALING

In [None]:
from sklearn.preprocessing import StandardScaler


def scale_dataframe(df, scaler = None):
    """Scale every column of ``df``, fitting a scaler only when none is supplied.

    Args:
        df: DataFrame of numeric features. It is not modified.
        scaler: An already fitted scaler exposing ``transform``. When ``None``
            a new ``StandardScaler`` is fitted on ``df``.

    Returns:
        A ``(df_scaled, scaler)`` tuple. ``df_scaled`` carries the same column
        labels and the same index as ``df``; ``scaler`` is the object that was
        applied (newly fitted, or the one passed in).
    """
    features = df.copy()

    if scaler is None:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(features)
    else:
        scaled_values = scaler.transform(features)

    # Rebuild the frame with the caller's index as well as its columns, so the
    # scaled rows still line up with any label Series that shares that index.
    df_scaled = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

    return df_scaled, scaler

In [None]:
# Separate the 'fake' target from the feature columns for both splits.
X_train, y_train = new_train_df.drop(columns='fake'), new_train_df['fake']
X_test, y_test = new_test_df.drop(columns='fake'), new_test_df['fake']

In [None]:
# Fit the scaler on the training features only.
# The input is rebuilt from new_train_df rather than taken from X_train, so
# re-running this cell cannot scale already-scaled data or refit the scaler on it.
X_train, scaler_train_data = scale_dataframe(new_train_df.drop('fake', axis=1))

In [None]:
# Scale the test features with the scaler fitted on the training split.
# The input is rebuilt from new_test_df rather than taken from X_test, so
# re-running this cell cannot scale the test features twice.
X_test, scaler_train_data = scale_dataframe(new_test_df.drop('fake', axis=1), scaler_train_data)

# Applying logistic regression on symmetric data #
- Using Yeo-Johnson power transformer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Logistic regression without sklearn (Custom CODE) #

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise.

    Args:
        x: Scalar or array-like of real values.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``.
    """
    # For very negative x, exp(-x) overflows to inf. The quotient is still the
    # correct limit (0.0), so only the spurious overflow warning is silenced.
    with np.errstate(over='ignore'):
        return 1.0 / (1.0 + np.exp(-x))

In [None]:
def compute_cost_vectorized(X, w, b, y):
  """Mean binary cross-entropy of a logistic model on ``(X, y)``.

  Args:
    X: Feature matrix of shape (m, n).
    w: Weight vector of shape (n,).
    b: Scalar intercept.
    y: 0/1 labels of length m.

  Returns:
    The average cost over the m examples.
  """
  m = X.shape[0]
  z = X @ w + b  # (m, n) @ (n,) -> (m,)
  labels = np.asarray(y, dtype=float)

  # With s = sigmoid(z):  -y*log(s) - (1-y)*log(1-s)  ==  log(1 + e^z) - y*z.
  # The right-hand form never evaluates log(0), so the cost stays finite even
  # when the sigmoid saturates to exactly 0 or 1.
  per_example_cost = np.logaddexp(0.0, z) - labels * z

  return np.sum(per_example_cost) / m


In [None]:
def gradient_functions_vectorized(X, w, b, y):
  """Gradients of the mean logistic cost with respect to ``w`` and ``b``.

  Args:
    X: Feature matrix of shape (m, n).
    w: Weight vector of shape (n,).
    b: Scalar intercept.
    y: 0/1 labels of length m.

  Returns:
    ``(dj_dw, dj_db)``: the (n,) weight gradient and the scalar intercept gradient.
  """
  num_examples, _ = X.shape
  predictions = sigmoid(X @ w + b)   # (m,) predicted probabilities
  residual = predictions - y         # (m,) prediction errors

  # X.T is (n, m); multiplying by the (m,) residual sums each feature's
  # contribution over all examples and yields the (n,) gradient.
  grad_w = (X.T @ residual) / num_examples
  grad_b = np.sum(residual) / num_examples

  return grad_w, grad_b

In [None]:
def train_vectorized(X, y, alpha, num_iters, log_every = 5000):
  """Fit logistic-regression parameters with batch gradient descent.

  Args:
    X: Feature matrix of shape (m, n).
    y: 0/1 labels of length m.
    alpha: Learning rate.
    num_iters: Number of gradient steps to take.
    log_every: Record and print the cost once every this many iterations,
      starting at iteration 0. Defaults to 5000.

  Returns:
    ``(w, b, cost_history)``: the learned weights, the learned intercept and the
    list of costs recorded at iterations 0, log_every, 2*log_every, ...

  Raises:
    ValueError: If ``log_every`` is not positive.
  """
  if log_every <= 0:
    raise ValueError("log_every must be a positive integer")

  cost_history = []
  w = np.zeros(X.shape[1])
  b = 0
  for i in range(num_iters):
    dj_dw, dj_db = gradient_functions_vectorized(X, w, b, y)
    w = w - alpha * dj_dw
    b = b - alpha * dj_db

    # The cost is evaluated after the update, i.e. for the new parameters.
    if i % log_every == 0:
      cost = compute_cost_vectorized(X, w, b, y)
      cost_history.append(cost)
      print(f"Iteration {i:5d}: Cost {cost:0.4f}")

  return w, b, cost_history


In [None]:
def predict_vectorized(X, w, b):
  """Hard 0/1 predictions of the logistic model for every row of ``X``.

  A row is labelled 1 when its predicted probability is at least 0.5.
  """
  probabilities = sigmoid(X @ w + b)
  return (probabilities >= 0.5).astype(int)

In [None]:
# Train the hand-written model: learning rate 0.01, 50,000 gradient steps.
w, b,cost_history = train_vectorized(X_train.values,y_train,0.01,50000)

In [None]:
# Hard 0/1 predictions of the hand-written model on the scaled test features.
y_pred = predict_vectorized(X_test.values,w,b)

In [None]:
# Test-set accuracy of the hand-written model.
accuracy_score(y_test,y_pred)

In [None]:
# Per-class precision / recall / F1 for the hand-written model.
print(classification_report(y_test, y_pred))

In [None]:
# cost_history holds one value per logging checkpoint; the training call above
# logs once every 5000 iterations. Convert checkpoint positions back to
# iteration numbers so the "Iteration" axis shows real iteration counts.
COST_LOG_INTERVAL = 5000
logged_iterations = [k * COST_LOG_INTERVAL for k in range(len(cost_history))]

plt.figure(figsize=(10, 6))
plt.plot(
    logged_iterations,
    cost_history,
    color='blue',
    linestyle='solid',
    linewidth=2
)

plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Cost', fontsize=12)
plt.title('Learning Curve (Cost vs. Iterations)', fontsize=14)
plt.show()


In [None]:
# Learned weights and intercept of the hand-written model.
w,b

# Now training scikit-learn Logistic Regression models: ridge (L2-regularized) and lasso (L1-regularized)

In [None]:
# L1-penalised (lasso) logistic regression fitted on the scaled training
# features, then scored on the scaled test features.
lasso_regression = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1.0,
    max_iter=100000,
    random_state=42,
)
lasso_regression.fit(X_train, y_train)
y_pred = lasso_regression.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
# Coefficients and intercept of the fitted lasso model.
# NOTE: `b` is rebound here, replacing the intercept of the hand-written model.
W, b = lasso_regression.coef_, lasso_regression.intercept_
W, b

In [None]:
# Take the lasso parameters straight from the fitted estimator instead of
# pasting numbers from an earlier output: pasted values go stale as soon as the
# data, the split or the library version changes, and would then no longer match
# the transformer and scaler fitted in this session.
W_lasso = lasso_regression.coef_.ravel()          # (1, n_features) -> (n_features,)
b_lasso = float(lasso_regression.intercept_[0])   # one-element array -> plain float

In [None]:
# Print each training-frame column next to its positional index.
for i, col in enumerate(train_df.columns):
  print(f"{col}  : {i}")

In [None]:
from sklearn.preprocessing import PowerTransformer
def process_and_predict_selected(record_selected,pt, scaler,w, b,threshold):
    """
    Score one raw account record with the fitted preprocessing and a linear model.

    record_selected : 1-D array-like holding one raw value for EVERY feature
                      column of X_train (not only `selected_columns`), in the
                      exact order of X_train.columns.
    pt              : fitted power transformer, passed on to transform().
    scaler          : fitted scaler, passed on to scale_dataframe().
    w, b            : weights and intercept of the logistic model. `w` may be
                      1-D or a single-row 2-D array.
    threshold       : probability at or above which the account counts as fake.

    Returns (is_not_fake, prob_fake): prob_fake is the predicted probability of
    the "fake" class and is_not_fake is True when prob_fake < threshold.

    Raises ValueError when the record does not contain exactly one value per
    column of X_train.
    """
    feature_names = X_train.columns

    record = np.asarray(record_selected, dtype=float).ravel()
    if record.size != len(feature_names):
        raise ValueError(
            f"expected {len(feature_names)} feature values "
            f"(one per X_train column), got {record.size}"
        )

    # One-row frame labelled like the training features, so the same column
    # based preprocessing can be applied to it.
    df_selected = pd.DataFrame(record.reshape(1, -1), columns=feature_names)
    # Apply (never refit) the fitted transformer, then the fitted scaler.
    transformed_df, _ = transform(df_selected, pt)
    scaled_df, _ = scale_dataframe(transformed_df, scaler)

    prob_fake = sigmoid(scaled_df.values @ np.ravel(w) + b)[0]
    is_not_fake = prob_fake < threshold

    return is_not_fake, prob_fake

In [None]:
# Feature column names, in the order a raw record must follow.
X_train.columns

In [None]:
# Hand-entered example accounts to score. process_and_predict_selected labels
# each array with X_train.columns, so every array must supply one raw
# (untransformed, unscaled) value per feature column, in that order.
all_records = {
    "Person 1": np.array([1, 0.25,  7, 0, 0, 19, 0, 1, 0, 277, 220]), # person_1
    "Person 2": np.array([1, 0.17, 22, 0, 0, 37, 0, 1, 7, 343, 308]), # person_2
    "Person 3": np.array([1, 0.16, 13, 0, 0, 0,  0, 1, 0, 302, 736]), # person_3
    "Person 4": np.array([0, 0.11, 17, 0, 0, 0,  0, 1, 0, 127, 247]), # person_4
    "Random 1": np.array([0, 0.05,  6, 0.15, 0, 32, 1,0, 34567,78, 69978]) # randomly selected
}

In [None]:
# Score every example record with the lasso parameters.
# A record is reported as fake when its probability is at or above `threshold`.
threshold = 0.50
for name, rec in all_records.items():
    ok, p = process_and_predict_selected(rec, pt_train_data, scaler_train_data, W_lasso, b_lasso, threshold)
    result = "Not-Fake " if ok else "Fake "
    # The arrow had been stored as mojibake; restored to the intended character.
    print(f"{name:10s} → {result}  (prob_fake = {p:.4f})")