# 1. Variance Threshold  


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold

# Step 1: Preprocess the Text Data
documents = ['This is the first document.', 'This document is the second document.', 'And this is the third one.', 'Is this the first document?']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Step 2: Calculate the Variance of Each Word
variances = X.toarray().var(axis=0)

# Step 3: Set the Variance Threshold
threshold = 0.01

# Step 4: Remove Low-Variance Words
selector = VarianceThreshold(threshold=threshold)
X_high_variance = selector.fit_transform(X)

# Get the selected features (words)
selected_features = vectorizer.get_feature_names_out()[selector.get_support()]

# Print the selected features
print("Selected Features (High Variance Words):")
for feature in selected_features:
    print(feature)


Selected Features (High Variance Words):
and
document
first
one
second
third


In [2]:
print(X)

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


In [3]:
print(variances)

[0.1875 0.5    0.25   0.     0.1875 0.1875 0.     0.1875 0.    ]


# 2. Chi-Square test 

In [8]:
import pandas as pd
from scipy.stats import chi2_contingency

# Create a DataFrame with customer demographics and product preferences
data = {
    'Age': ['18-24', '25-34', '35-44', '45-54', '55+'],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Product_Category': ['Electronics', 'Fashion', 'Electronics', 'Fashion', 'Electronics']
}

df = pd.DataFrame(data)

# Create a contingency table
contingency_table = pd.crosstab(df['Age'], df['Product_Category'])

# Perform the Chi-square Test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-square Statistic:", chi2)
print("p-value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:")
print(pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns))


Chi-square Statistic: 5.000000000000001
p-value: 0.2872974951836456
Degrees of Freedom: 4
Expected Frequencies:
Product_Category  Electronics  Fashion
Age                                   
18-24                     0.6      0.4
25-34                     0.6      0.4
35-44                     0.6      0.4
45-54                     0.6      0.4
55+                       0.6      0.4


In [6]:
df

Unnamed: 0,Age,Gender,Product_Category
0,18-24,Male,Electronics
1,25-34,Female,Fashion
2,35-44,Male,Electronics
3,45-54,Female,Fashion
4,55+,Male,Electronics


In [7]:
contingency_table

Product_Category,Electronics,Fashion
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
18-24,1,0
25-34,0,1
35-44,1,0
45-54,0,1
55+,1,0


# 3. Information Gain

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Generate sample email dataset
emails = [
    ("spam", "Get a free vacation now!"),
    ("spam", "Exclusive deal: Buy now and save 50%!"),
    ("legitimate", "Meeting tomorrow at 10 AM."),
    ("legitimate", "Reminder: Pay your bills on time."),
    ("legitimate", "Confirm your email subscription.")
]

df = pd.DataFrame(emails, columns=["label", "email"])

# Split the dataset into features and labels
X = df["email"]
y = df["label"]

# Feature extraction and encoding
vectorizer = CountVectorizer()
X_encoded = vectorizer.fit_transform(X)

# Feature selection using Information Gain
k = 2  # Select top 2 features
selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_selected = selector.fit_transform(X_encoded, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Build a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  legitimate       0.00      0.00      0.00       0.0
        spam       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print(X_encoded)

  (0, 12)	1
  (0, 11)	1
  (0, 22)	1
  (0, 14)	1
  (1, 14)	1
  (1, 10)	1
  (1, 8)	1
  (1, 6)	1
  (1, 3)	1
  (1, 18)	1
  (1, 1)	1
  (2, 13)	1
  (2, 21)	1
  (2, 4)	1
  (2, 0)	1
  (2, 2)	1
  (3, 17)	1
  (3, 16)	1
  (3, 23)	1
  (3, 5)	1
  (3, 15)	1
  (3, 20)	1
  (4, 23)	1
  (4, 7)	1
  (4, 9)	1
  (4, 19)	1


In [29]:
print(X_selected)

  (0, 0)	1
  (1, 0)	1
  (3, 1)	1
  (4, 1)	1
