In [3]:

from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import numpy as np
import ollama
from typing import Any, List, Dict
import pandas as pd
import time
import praw
import requests
from IPython.display import display, HTML

def create_embeddings_for_text(text: str) -> List[float]:
    """
    Create embeddings for the given text using a local model
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return embeddings.embed_query(text)

def split_and_embed_text(text: str) -> tuple:
    """
    Split text into chunks and create embeddings
    """
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_text(text)
    
    # Create embeddings for chunks
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    
    # Create vector store
    vectorstore = FAISS.from_texts(chunks, embeddings)
    
    return chunks, vectorstore

def summarize_pain_point(text: str, model="qwen2.5:7b") -> str:
    """
    Use RAG to summarize the pain point from a post
    """
    # Split and embed the text
    chunks, vectorstore = split_and_embed_text(text)
    
    # Create query embedding and find most relevant chunks
    query = "What is the main problem or pain point discussed?"
    relevant_chunks = vectorstore.similarity_search(query, k=2)
    
    # Combine relevant chunks
    context = "\n".join([chunk.page_content for chunk in relevant_chunks])
    
    prompt = f"""
    Based on the following relevant excerpts from a Reddit post, identify and summarize 
    the main pain point or problem the user is experiencing in 1-2 sentences.
    If there's a potential business opportunity or startup idea to solve this problem, 
    briefly mention it. If no clear pain point exists, respond with "No clear pain point identified."
    
    RELEVANT EXCERPTS:
    {context}
    
    PAIN POINT SUMMARY:
    """
    
    try:
        response = ollama.generate(model=model, prompt=prompt)
        return response['response'].strip()
    except Exception as e:
        return f"Error: {str(e)}"



In [4]:
subreddit = "devops"
reddit = praw.Reddit(
    client_id="Bz7Ry81mv1huFSpT8EZ9Og",        # Replace with your client ID
    client_secret="k2Eg9bl_ulMzfo40HahGNquMLWoBcg", # Replace with your client secret
    user_agent="web app by /u/zoe-zyn"
)
posts = get_reddit_posts_praw(reddit, subreddit, limit=30)

NameError: name 'get_reddit_posts_praw' is not defined

In [20]:
posts[0].comments[0].body

'Just curious, how long was it until you caught the mistake and brought it up to them?'

In [3]:
# from typing import Any, List, Dict
# import pandas as pd
# import time
# import praw
# import requests
# from IPython.display import display, HTML

# # Reddit API setup using PRAW
# def setup_reddit() -> praw.Reddit:
#     """
#     Set up and return a Reddit instance using PRAW
#     Note: For a real application, you should use environment variables or a config file
#     """
#     reddit = praw.Reddit(
#         client_id="Bz7Ry81mv1huFSpT8EZ9Og",        # Replace with your client ID
#         client_secret="k2Eg9bl_ulMzfo40HahGNquMLWoBcg", # Replace with your client secret
#         user_agent="web app by /u/zoe-zyn"
#     )
#     return reddit

# # Function to get posts from a subreddit using PRAW
# def get_reddit_posts_praw(reddit, subreddit_name, limit=20, timeframe='week') -> List[praw.models.Submission]:
#     """
#     Fetch posts from a subreddit using PRAW
#     """
#     subreddit = reddit.subreddit(subreddit_name)
    
#     # # Get top posts based on timeframe
#     # if timeframe == 'day':
#     #     posts = subreddit.top('day', limit=limit)
#     # elif timeframe == 'week':
#     #     posts = subreddit.top('week', limit=limit)
#     # elif timeframe == 'month':
#     #     posts = subreddit.top('month', limit=limit)
#     # elif timeframe == 'year':
#     #     posts = subreddit.top('year', limit=limit)
#     # elif timeframe == 'all':
#     #     posts = subreddit.top('all', limit=limit)
#     # else:
#     #     posts = subreddit.top('week', limit=limit)

#     posts = subreddit.search("complain OR issue OR problem", limit=limit)
    
#     return list(posts)

# # # Function to filter posts that likely contain complaints
# # def filter_complaint_posts(posts) -> List[praw.models.Submission]:
# #     """
# #     Filter posts that likely contain complaints based on keywords and patterns
# #     """
# #     complaint_keywords = [
# #         'problem', 'issue', 'hate', 'annoying', 'frustrated', 'disappointing',
# #         'terrible', 'awful', 'wish', 'should', 'need to', 'can\'t stand',
# #         'difficult', 'struggle', 'pain', 'annoyed', 'tired of', 'sick of',
# #         'why isn\'t there', 'why can\'t', 'doesn\'t work', 'broken'
# #     ]
    
# #     filtered_posts = []
    
# #     for post in posts:
# #         title = post.title.lower()
# #         selftext = post.selftext.lower() if hasattr(post, 'selftext') else ''
        
# #         # Check if any complaint keywords are in the title or text
# #         if any(keyword in title or keyword in selftext for keyword in complaint_keywords):
# #             filtered_posts.append(post)
    
# #     return filtered_posts


# def summarize_pain_point1(text, model="qwen2.5:7b") -> str:
#     """
#     Use Ollama to summarize the pain point from a post
#     """
#     # Truncate text if it's too long (many models have context limits)
#     max_length = 4000
#     if len(text) > max_length:
#         text = text[:max_length] + "..."
    
#     prompt = f"""
#     The following is a post from Reddit. Identify and summarize the main pain point or problem the user is experiencing in 1-2 sentences.
#     If there's a potential business opportunity or startup idea to solve this problem, briefly mention it.
#     If no clear pain point exists, respond with "No clear pain point identified."
    
#     POST:
#     {text}
    
#     PAIN POINT SUMMARY:
#     """
    
#     try:
#         response = ollama.generate(model=model, prompt=prompt)
#         return response['response'].strip()
#     except Exception as e:
#         return f"Error: {str(e)}"

# # Main function to analyze subreddits
# def analyze_subreddit_pain_points(subreddits: List[str], posts_per_subreddit: int = 5) -> List[Dict[str, Any]]:
# # def analyze_subreddit_pain_points(subreddits, posts_per_subreddit=5):
#     """
#     Analyze pain points from multiple subreddits using PRAW
#     """
#     all_results = []
    
#     # Set up Reddit API
#     try:
#         reddit = setup_reddit()
#         print("Successfully connected to Reddit API")
#     except Exception as e:
#         print(f"Error setting up Reddit API: {str(e)}")
#         return []
    
#     for subreddit in subreddits:
#         print(f"Analyzing r/{subreddit}...")
        
#         try:
#             # Get posts from subreddit
#             posts = get_reddit_posts_praw(reddit, subreddit, limit=30)
            
#             # Filter for complaint posts
#             complaint_posts = posts
#             # complaint_posts = filter_complaint_posts(posts)
            
#             # Limit to requested number of posts
#             complaint_posts = complaint_posts[:posts_per_subreddit]
            
#             for post in complaint_posts:
#                 # Combine title and text for analysis
#                 full_text = f"Title: {post.title}\n\nContent: {post.selftext if hasattr(post, 'selftext') else '[No content]'}"
                
#                 # Summarize the pain point
#                 summary = summarize_pain_point(full_text)
                
#                 # Add to results
#                 result = {
#                     'subreddit': subreddit,
#                     'title': post.title,
#                     'url': f"https://www.reddit.com{post.permalink}",
#                     'score': post.score,
#                     'num_comments': post.num_comments,
#                     'pain_point_summary': summary
#                 }
                
#                 all_results.append(result)
                
#                 # Avoid rate limiting
#                 time.sleep(1)
                
#         except Exception as e:
#             print(f"Error processing r/{subreddit}: {str(e)}")
    
#     return all_results

# # Function to display results in a nice format
# def display_results(results):
#     """
#     Display the results in a formatted table
#     """
#     if not results:
#         print("No results to display.")
#         return None
        
#     df = pd.DataFrame(results)
    
#     # Create clickable links
#     df['title'] = df.apply(lambda row: f"<a href='{row['url']}' target='_blank'>{row['title']}</a>", axis=1)
    
#     # Select and reorder columns for display
#     display_df = df[['subreddit', 'title', 'score', 'num_comments', 'pain_point_summary']]
#     display_df.columns = ['Subreddit', 'Post Title', 'Score', 'Comments', 'Pain Point Summary']
    
#     # Display as HTML
#     display(HTML(display_df.to_html(escape=False)))
    
#     return df  # Return the full dataframe for further processing

# Test the functionality with PRAW
subreddits_to_analyze = [
    # 'productivity',
    # 'freelance',
    # 'smallbusiness',
    # 'parenting',
    # 'fitness'
    'devops'
]

# Run the analysis
print("Starting Reddit pain point analysis...")
results = analyze_subreddit_pain_points(subreddits_to_analyze, posts_per_subreddit=10)

# Display the results
if results:
    df_results = display_results(results)
    
    # Save results to CSV for later use
    df_results.to_csv('reddit_pain_points.csv', index=False)
    
    print(f"Analysis complete! Found {len(results)} potential pain points across {len(subreddits_to_analyze)} subreddits.")
else:
    print("Analysis failed or returned no results.")

Starting Reddit pain point analysis...
Successfully connected to Reddit API
Analyzing r/devops...


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Subreddit,Post Title,Score,Comments,Pain Point Summary
0,devops,Beware of GitLab billing issues,167,44,"The user is experiencing frustration with poor customer support from their GitLab account manager regarding an alleged billing mistake. This issue highlights a potential business opportunity for a startup that could provide comprehensive support services specifically addressing the needs of GitLab users, offering quicker resolution and more attentive service compared to what the user encountered."
1,devops,/r/devops will be going dark on the 12th,762,112,"The main pain point is the user's frustration with Reddit's API issues, which have caused significant disruption, and the perceived lack of action from the platform to resolve these problems.\n\nNo clear business opportunity directly emerges from this specific excerpt, but there could be a potential startup idea focused on monitoring and alerting services for APIs or platforms, helping users mitigate disruptions by providing alternative solutions or tools during such incidents."
2,devops,When load tests hide the real issues and lead to crashes,2,0,"The user is experiencing pain points related to load testing hiding real issues, making it difficult to identify and fix bottlenecks and bugs in high-stability systems. This can lead to crashes despite passing initial load tests.\n\nPotential business opportunity: A startup could develop advanced load testing tools or services that accurately simulate real-world scenarios to uncover hidden issues before they cause problems in production environments."
3,devops,finding OS level virtualization solutions,0,27,"The user is experiencing difficulties finding a simple, sandboxed environment where they can test web applications locally and easily sync changes to a remote server without issues related to Docker, chroot, LXC, or FreeBSD jails. This suggests a potential business opportunity for a lightweight, reliable OS-level virtualization solution that addresses these specific needs."
4,devops,Why do internal development platform technologies feel half-baked?,28,35,"The user is experiencing frustration with a product that has been consistently poorly designed and documented for two years, despite shifting priorities, and is also seen as fundamentally flawed due to the involvement of middlemen. This suggests a potential business opportunity in creating a more streamlined, well-documented, and direct alternative to this process."
5,devops,is monitoring Kafka hard for you? Looking for feedback on some features for better monitoring and troubleshooting Kafka,10,7,"The user is experiencing difficulty in understanding and remedying issues within messaging systems, which poses a challenge for improving the reliability and efficiency of such systems. This pain point could be addressed through a startup that develops intuitive tools or platforms specifically designed to diagnose and resolve messaging system issues more effectively.\n\n**PAIN POINT SUMMARY:** The main issue is the difficulty in diagnosing and resolving problems within messaging systems, making it challenging to ensure their reliable operation.\n\n**Potential Business Opportunity:** A platform that provides real-time diagnostics, troubleshooting guides, and collaborative support for identifying and fixing issues in messaging systems could fill this need."
6,devops,Terraform Does Not Import,5,2,"No clear pain point identified.\n\nThe provided excerpts do not contain any specific issues or problems that the user is experiencing. The post seems to indicate a journey has been completed with support from others, but no detailed pain points are mentioned."
7,devops,Need help trying to make POST request from my static web app on Github Pages to SpringBoot on EC2 via HTTPS,0,3,"Pain Point Summary: The user is struggling to perform HTTPS requests from their static web app hosted on Github Pages to a SpringBoot application running on an EC2 instance due to Github Pages not supporting HTTP.\n\nPotential Business Opportunity/Startup Idea: A service that automatically handles SSL/TLS termination and routing for requests between static sites hosted on platforms like Github Pages and backend services running on cloud providers such as AWS, simplifying cross-origin request configurations."
8,devops,"Not every ""Getting into DevOps"" topic is fruitless.",0,6,"Pain Point Summary: The user identifies a common frustration of having to navigate through a long list of things to learn in DevOps, which can be overwhelming and may discourage beginners from even starting. This discouragement could lead them to give up before they have the chance to explore topics that might not involve easy answers.\n\nPotential Business Idea: A platform or course that offers bite-sized, practical modules on various DevOps tools and concepts, providing a structured yet flexible learning path to keep users motivated and engaged without feeling overwhelmed."
9,devops,PLEASE stop shoehorning devops where it doesn't belong OR WHERE YOU AREN'T READY FOR IT,6,15,"Pain Point Summary: The user is frustrated with the expectation that infrastructure and operations teams should adopt methodologies like Agile, Scrum, and DevOps in the same way as software development teams, despite the fundamental differences between managing hardware and software.\n\nPotential Business Opportunity: A startup could offer tailored training and consulting services that help organizations understand and implement methodologies suited for infrastructure and operations teams, focusing on practical, realistic solutions rather than a one-size-fits-all approach."


Analysis complete! Found 10 potential pain points across 1 subreddits.


In [5]:
from get_data import get_posts_from_subreddits
results = get_posts_from_subreddits(subreddits=["devops"],search_limit=50)

Successfully connected to Reddit API
Getting posts from r/devops...


In [9]:
len(results)

50

In [20]:
# Create embeddings for all pain point summaries
from bertopic import BERTopic

all_post_contents = [post['content'] for post in results]

print("all_post_contents", all_post_contents)

# Initialize BERTopic
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",  # Same model as we used for RAG
    # min_topic_size=2,  # Minimum number of documents per topic
    # nr_topics='auto'   # Automatically determine number of topics
)

topics, probs = topic_model.fit_transform(all_post_contents)



In [23]:
topic_model.get_topic(1)

[('java', np.float64(0.09199520994249995)),
 ('at', np.float64(0.0673510560936753)),
 ('the', np.float64(0.06683542424010597)),
 ('org', np.float64(0.061178038053668206)),
 ('to', np.float64(0.04832994534152003)),
 ('as', np.float64(0.04463335871999558)),
 ('sonar', np.float64(0.03888396583293345)),
 ('ce', np.float64(0.03683020375345071)),
 ('ceworkerimpl', np.float64(0.03543926781727321)),
 ('that', np.float64(0.034373184577716265))]

In [None]:
topics

In [26]:
from typing import Dict, List
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
from IPython.display import display, HTML
import ollama

def categorize_posts(posts_data: List[Dict], model="qwen2.5:7b") -> Dict:
    """
    Group similar pain points into categories and summarize each category
    """
    # Create embeddings for all pain point summaries
    all_post_contents = [post['content'] for post in posts_data]
    # chunks, vectorstore = split_and_embed_text("\n".join(all_summaries))
    
    # # Use clustering to group similar pain points
    # embeddings = [create_embeddings_for_text(summary) for summary in all_summaries]
    # from sklearn.cluster import KMeans
    
    # # Convert embeddings to numpy array
    # embeddings_array = np.array(embeddings)
    
    # # Determine number of clusters (you might want to tune this)
    # n_clusters = min(5, len(embeddings))
    # kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    # clusters = kmeans.fit_predict(embeddings_array)

    print("all_post_contents", all_post_contents)
    
    # Initialize BERTopic
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",  # Same model as we used for RAG
        # min_topic_size=2,  # Minimum number of documents per topic
        # nr_topics='auto'   # Automatically determine number of topics
    )
    
    # Fit the model and get topics
    topics, probs = topic_model.fit_transform(all_post_contents) # topics is a list of cluster numbers: [0,-1,0,1,0,0,..]
    
    # Group posts by cluster
    categorized_posts = {}
    for i, cluster in enumerate(topics):
        if cluster not in categorized_posts:
            categorized_posts[cluster] = []
        categorized_posts[cluster].append(posts_data[i])
    return categorized_posts

def summarize_pain_points(categorized_posts: Dict, model="qwen2.5:7b") -> Dict:
    
    # Summarize each category
    categories = {}
    for cluster, posts in categorized_posts.items():
        # # Combine all pain point summaries in this cluster
        # cluster_summaries = [post['pain_point_summary'] for post in posts]
        # combined_summary = "\n".join(cluster_summaries)
        if cluster == -1:  # Skip outlier topic
            continue
            
        # # Get topic words and their scores
        # topic_words = [word for word, _ in topic_model.get_topic(cluster)]
        # topic_words_str = ", ".join(topic_words[:5])  # Top 5 words
        
        # Combine all pain point summaries in this cluster
        # post_contents = 
        post_contents = "\n".join([post['content'] for post in posts])
        
        # prompt = f"""
        # Based on the following group of related pain points, provide:
        # 1. A category name that best describes these related issues (one short phrase)
        # 2. A summary of the common themes and problems in this category (2-3 sentences)

        # Pain points in this group:
        # {post_contents}
        
        # Format your response as:
        # Category: [category name]
        # Summary: [summary of common themes]
        # """
        prompt = f"""
        Based on these related posts from r/devops, identify the common pain point or problem these users are experiencing.

        Posts:
        {post_contents}

        Format your response as:
        Category: [A category name that best describes these related issues]
        Pain Points: [2-4 sentences summarizing the shared problems]

        There should be only one category and one pain point.
        """ 
        try:
            response = ollama.generate(model=model, prompt=prompt)
            category_info = response['response'].strip()
            
            categories[cluster] = {
                'category_info': category_info, #ouput from LLM
                'posts': posts,
                # 'topic_words': topic_words_str
            }
        except Exception as e:
            print(f"Error summarizing cluster {cluster}: {str(e)}")
    
    return categories

def display_categorized_results(categories: Dict):
    """
    Display the categorized results in a formatted HTML output
    """
    html_output = "<div style='font-family: Arial, sans-serif;'>"
    
    for cluster, data in categories.items():
        category_info = data['category_info']
        posts = data['posts']
        
        html_output += f"<div style='margin-bottom: 30px; padding: 15px; border: 1px solid #ddd; border-radius: 5px;'>"
        html_output += f"<pre style='white-space: pre-wrap;'>{category_info}</pre>"
        html_output += "<h4>Posts in this category:</h4>"
        html_output += "<ul>"
        
        for post in posts:
            html_output += f"""
            <li>
                <a href='{post["url"]}' target='_blank'>{post["title"]}</a>
                <br>
                <small>Score: {post["score"]} | Comments: {post["num_comments"]}</small>
                <br>
                <em>Content: {post["content"]}</em>
            </li>
            """
        
        html_output += "</ul></div>"
    
    html_output += "</div>"
    display(HTML(html_output))

# Modify the main analysis flow

# After getting results from analyze_subreddit_pain_points:
if results:
    print("Categorizing pain points...")
    categorized_posts = categorize_posts(results)
    categories = summarize_pain_points(categorized_posts)
    
    print("Displaying categorized results...")
    display_categorized_results(categories)
    
    # Save results to CSV with category information
    categorized_df = []
    for cluster, data in categories.items():
        category_info = data['category_info']
        for post in data['posts']:
            post_data = post.copy()
            post_data['category'] = category_info
            categorized_df.append(post_data)
    
    df_results = pd.DataFrame(categorized_df)
    df_results.to_csv('reddit_pain_points_categorized.csv', index=False)
    
    print(f"Analysis complete! Found {len(results)} potential pain points across {len(categories)} categories.")
else:
    print("Analysis failed or returned no results.")

Categorizing pain points...
Displaying categorized results...


Analysis complete! Found 50 potential pain points across 2 categories.


In [25]:
categories

{0: {'category_info': "### Category: Learning Curve and Skill Mismatch\n\n### Pain Points:\n1. **Inadequate Preparation**: Junior engineers often face challenges because they may not have a comprehensive understanding of the systems they are working on, leading to frequent mistakes and delays.\n2. **Skill Gaps**: There's a mismatch between the theoretical knowledge junior engineers learn in their courses and the practical skills required in real-world projects, which can lead to confusion and inefficiencies.\n\n---\n\n### Category: Inadequate Documentation and Knowledge Transfer\n\n### Pain Points:\n1. **Lack of Clear Documentation**: Poorly documented systems make it difficult for new team members to understand how everything works, leading to errors and wasted time.\n2. **Knowledge Silos**: When critical knowledge is held by a few experienced members who don't share their expertise, it creates bottlenecks when those individuals are unavailable or leave the company.\n\n---\n\n### Cate

In [10]:
results

[{'subreddit': 'devops',
  'title': 'Beware of GitLab billing issues',
  'url': 'https://www.reddit.com/r/devops/comments/t0qizc/beware_of_gitlab_billing_issues/',
  'score': 167,
  'num_comments': 44,
  'pain_point_summary': 'The user is experiencing frustration with poor customer support from their GitLab account manager regarding an alleged billing mistake. This issue highlights a potential business opportunity for a startup that could provide comprehensive support services specifically addressing the needs of GitLab users, offering quicker resolution and more attentive service compared to what the user encountered.'},
 {'subreddit': 'devops',
  'title': '/r/devops will be going dark on the 12th',
  'url': 'https://www.reddit.com/r/devops/comments/143jc6m/rdevops_will_be_going_dark_on_the_12th/',
  'score': 762,
  'num_comments': 112,
  'pain_point_summary': "The main pain point is the user's frustration with Reddit's API issues, which have caused significant disruption, and the per

In [None]:
import praw

# Initialize Reddit API client
reddit = praw.Reddit(
    client_id="Bz7Ry81mv1huFSpT8EZ9Og",
    client_secret="k2Eg9bl_ulMzfo40HahGNquMLWoBcg",
    user_agent="web app by /u/zoe-zyn"
)

# Fetch posts from a subreddit
subreddit = reddit.subreddit("devops")
complaint_posts = []

# Search for posts with keywords like "complain", "issue", "problem", etc.
for post in subreddit.search("complain OR issue OR problem", limit=10):
    complaint_posts.append({
        "title": post.title,
        "body": post.selftext,
        "url": post.url
    })

print(complaint_posts)



In [4]:
import praw

# Initialize Reddit API client
reddit = praw.Reddit(
    client_id="Bz7Ry81mv1huFSpT8EZ9Og",
    client_secret="k2Eg9bl_ulMzfo40HahGNquMLWoBcg",
    user_agent="web app by /u/zoe-zyn"
)

# Fetch posts from a subreddit
subreddit = reddit.subreddit("devops")
complaint_posts = []

# Search for posts with keywords like "complain", "issue", "problem", etc.
for post in subreddit.search("complain OR issue OR problem", limit=10):
    complaint_posts.append({
        "title": post.title,
        "body": post.selftext,
        "url": post.url
    })

print(complaint_posts)

