In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords if you haven't already
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Load data
df = pd.read_csv('train.csv', delimiter=',', on_bad_lines='skip')

In [3]:
# Combine Title and Body for topic modeling
df['text'] = df['Title'] + " " + df['Body']

In [4]:
# Preprocessing function with NLTK
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)


In [5]:
# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
text_matrix_tfidf = tfidf_vectorizer.fit_transform(df['text'])

# LDA Model
n_topics = 10  # Choose the number of topics
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda_model.fit(text_matrix_tfidf)

In [8]:
# Display the topics with keywords
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}: ", end='')
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda_model, tfidf_vectorizer.get_feature_names_out(), 10)

# Transform the documents to their topic distributions
topic_distributions = lda_model.transform(text_matrix_tfidf)

# Function to recommend similar questions
def recommend_similar_questions(question_id, top_n=5):
    # Find the topic distribution for the given question
    question_index = df[df['Id'] == question_id].index[0]
    question_topic_dist = topic_distributions[question_index]

    # Compute similarity with other questions
    similarities = cosine_similarity([question_topic_dist], topic_distributions)[0]

    # Get indices of the top similar questions
    similar_indices = similarities.argsort()[-top_n-1:-1][::-1]

    # Display similar questions
    print(f"Recommendations for Question ID {question_id}:\n")
    for idx in similar_indices:
        print(f"Question ID: {df.iloc[idx]['Id']}")
        print(f"Title: {df.iloc[idx]['Title']}")
        print(f"Tags: {df.iloc[idx]['Tags']}\n")

Topic 1: token request webpack firebase url server pi domain error using
Topic 2: int public class void string return new codepre code static
Topic 3: component react import gt codepre const export angular return render
Topic 4: array string codepre want like list value pi data number
Topic 5: date sql query table select join format database insert month
Topic 6: func let cell sheet nil tableview sheets iboutlet viewcontroller indexpath
Topic 7: npm ltdependencygt err wsystemerr verbose precodenpm audio gyp installcode typings
Topic 8: branch git commit commits branches merge codegit checkout master svn
Topic 9: pi file using image app codepre use android error like
Topic 10: div html button echo form page ltdiv var php ltdivgt


In [9]:
# Test recommendation function
recommend_similar_questions(question_id=34552656)

Recommendations for Question ID 34552656:

Question ID: 55105647
Title: Something is wrong with my pyhton write to interact game code
Tags: <python>

Question ID: 58089057
Title: Is there a way of extracting xpath and text from webpage and save to file?
Tags: <c#><html><selenium><dictionary><xpath>

Question ID: 50163300
Title: Filtering data with Select (advanced)
Tags: <php><html><mysql><filter>

Question ID: 37686124
Title: perl search and replace with Tie
Tags: <perl>

Question ID: 51463625
Title: \n doesn't work in Python 3.6
Tags: <python><jupyter-notebook>

