In [1]:
!pip install pandas numpy scikit-learn sentence-transformers torch


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer, util
import torch

# Create main folders for organization
base_folder = os.getcwd()
folders = ['data', 'models', 'outputs']
for f in folders:
    os.makedirs(os.path.join(base_folder, f), exist_ok=True)

print("✅ Folder system ready (data/, models/, outputs/)")


✅ Folder system ready (data/, models/, outputs/)


In [5]:
data_folder = os.path.join(base_folder, 'data')
csv_path = os.path.join(data_folder, 'Software Questions.csv')

if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"❌ Dataset not found at {csv_path}. Please place your CSV file inside the 'data' folder."
    )

# Try with 'utf-8', fall back to 'latin1' if decoding fails
try:
    df = pd.read_csv(csv_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding='latin1')

print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
df.head()
print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
display(df.head())


✅ Dataset loaded successfully!
Shape: (200, 5)
✅ Dataset loaded successfully!
Shape: (200, 5)


Unnamed: 0,Question Number,Question,Answer,Category,Difficulty
0,1,What is the difference between compilation and...,Compilation translates source code into machin...,General Programming,Medium
1,2,Explain the concept of polymorphism.,Polymorphism allows objects of different class...,General Programming,Medium
2,3,Define encapsulation and give an example.,Encapsulation bundles data and methods in a cl...,General Programming,Hard
3,4,"What is an abstract class, and how is it diffe...",An abstract class can't be instantiated and ca...,General Programming,Medium
4,5,Describe the principles of Object-Oriented Pro...,"OOP principles include encapsulation, inherita...",General Programming,Medium


In [7]:

required_columns = ['Question', 'Category']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"❌ Missing required column: '{col}' in dataset")

df.dropna(subset=['Question', 'Category'], inplace=True)
df['Question'] = df['Question'].astype(str).str.strip()
df['Category'] = df['Category'].astype(str).str.strip().str.lower()

print("✅ Data cleaned successfully!")
print(f"Total questions after cleaning: {len(df)}")


✅ Data cleaned successfully!
Total questions after cleaning: 200


In [8]:
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['Category'])

print("\n📚 Unique Categories:")
print(list(label_encoder.classes_))



📚 Unique Categories:
['algorithms', 'artificial intelligence', 'back-end', 'data engineering', 'data structures', 'database and sql', 'database systems', 'devops', 'distributed systems', 'front-end', 'full-stack', 'general program', 'general programming', 'languages and frameworks', 'low-level systems', 'machine learning', 'networking', 'security', 'software testing', 'system design', 'version control', 'web development']


In [10]:
# ✂️ Train-Test Split
# Remove categories with only 1 sample (cannot stratify)
category_counts = df['category_encoded'].value_counts()
df = df[df['category_encoded'].isin(category_counts[category_counts > 1].index)]

print(f"✅ Filtered dataset now has {df['category_encoded'].nunique()} valid categories after removing rare ones.")

X_train, X_test, y_train, y_test = train_test_split(
    df['Question'].tolist(),
    df['category_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['category_encoded']
)

print(f"✅ Data split into {len(X_train)} training and {len(X_test)} testing samples.")


✅ Filtered dataset now has 20 valid categories after removing rare ones.
✅ Data split into 158 training and 40 testing samples.


In [11]:
print("🔄 Loading Sentence Transformer model... (may take a few seconds)")
model_embed = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded successfully!")


🔄 Loading Sentence Transformer model... (may take a few seconds)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully!


In [12]:
print("\nEncoding training and test questions...")
X_train_emb = model_embed.encode(X_train, convert_to_numpy=True, batch_size=32, show_progress_bar=True)
X_test_emb = model_embed.encode(X_test, convert_to_numpy=True, batch_size=32, show_progress_bar=True)
print("✅ Embeddings generated for training and test data!")



Encoding training and test questions...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Embeddings generated for training and test data!


In [13]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_emb, y_train)
print("✅ Classifier trained successfully!")


✅ Classifier trained successfully!


In [15]:
y_pred = clf.predict(X_test_emb)
print("\n📈 Classification Report:")
present_labels = np.unique(y_test)
present_target_names = label_encoder.inverse_transform(present_labels)
print(classification_report(y_test, y_pred, labels=present_labels, target_names=present_target_names))
print("✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")

print("✅ Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")



📈 Classification Report:
                          precision    recall  f1-score   support

              algorithms       1.00      0.50      0.67         2
                back-end       0.67      0.67      0.67         3
         data structures       0.00      0.00      0.00         2
        database and sql       0.00      0.00      0.00         2
        database systems       0.00      0.00      0.00         1
                  devops       0.38      0.75      0.50         4
     distributed systems       0.00      0.00      0.00         1
               front-end       0.43      1.00      0.60         3
              full-stack       0.00      0.00      0.00         2
     general programming       0.00      0.00      0.00         2
languages and frameworks       0.00      0.00      0.00         2
        machine learning       0.00      0.00      0.00         1
              networking       0.00      0.00      0.00         1
                security       1.00      0.67    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
# 🔍 Build Question Retrieval System
print("Building embeddings for all questions (this may take a while)...")
all_q_embeddings = model_embed.encode(df['Question'].tolist(), convert_to_numpy=True, batch_size=32, show_progress_bar=True)
print("✅ Embedding index ready!")

def get_questions_for_stack(stack_name, top_k=5):
    stack_name = stack_name.lower().strip()
    query_emb = model_embed.encode([f"Interview questions about {stack_name}"], convert_to_numpy=True)
    cosine_scores = util.cos_sim(query_emb, all_q_embeddings)[0].cpu().numpy()
    top_indices = cosine_scores.argsort()[-top_k:][::-1]

    print(f"\n💡 Top {top_k} questions related to '{stack_name}':\n")
    for idx in top_indices:
        q = df.iloc[idx]['Question']
        cat = df.iloc[idx]['Category']
        diff = df.iloc[idx]['Difficulty'] if 'difficulty' in df.columns else 'N/A'
        print(f"[{cat} | {diff}] {q}\n")


Building embeddings for all questions (this may take a while)...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Embedding index ready!


In [18]:
get_questions_for_stack('react')
get_questions_for_stack('python')
get_questions_for_stack('machine learning')



💡 Top 5 questions related to 'react':

[front-end | N/A] Can you explain the concept of 'state' in React?

[front-end | N/A] Can you explain the concept of 'state' in React?

[front-end | N/A] Explain the use of hooks in React.

[front-end | N/A] Explain the use of hooks in React.

[languages and frameworks | N/A] Discuss the role of a package manager like npm or pip.


💡 Top 5 questions related to 'python':

[languages and frameworks | N/A] What are the differences between Python 2 and Python 3?

[languages and frameworks | N/A] Discuss the role of a package manager like npm or pip.

[machine learning | N/A] Implement a natural language processing algorithm to understand and answer user queries.

[software testing | N/A] Describe the differences between manual and automated testing.

[low-level systems | N/A] Build a compiler for a new programming language.


💡 Top 5 questions related to 'machine learning':

[machine learning | N/A] Develop a machine learning model to predict stock p

In [19]:
def predict_category(new_question):
    new_emb = model_embed.encode([new_question], convert_to_numpy=True)
    pred = clf.predict(new_emb)
    category = label_encoder.inverse_transform(pred)[0]
    print(f"\n🧩 Predicted Tech Stack: {category}")

predict_category("Explain the difference between state and props in React.")



🧩 Predicted Tech Stack: front-end
