In [5]:
import praw
import pandas as pd
from textblob import TextBlob
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Initialize Reddit instance
reddit = praw.Reddit(
    client_id="nNt1apRLcond1DKKcfhMJg",
    client_secret="OYprE9hUaZriB4lRixxH6iq3duoZXw",
    user_agent="script:networking_analysis:v1.0 (by /u/mirk01)"
)

# Keywords related to networking issues
networking_keywords = [
    'networking', 'network', 'connections', 'connect', 'referral', 'referrals',
    'linkedin', 'linkedin profile', 'professional network', 'reach out',
    'cold email', 'cold message', 'cold call', 'informational interview'
]

def is_networking_related(text):
    text = text.lower()
    return any(keyword in text for keyword in networking_keywords)

def extract_problems(text):
    # Common problem indicators
    problem_indicators = [
        'difficult', 'hard', 'trouble', 'struggle', 'problem', 'issue',
        'challenge', 'frustrated', 'anxious', 'nervous', 'afraid', 'scared',
        'don\'t know', 'confused', 'overwhelmed', 'stuck'
    ]
    
    # Split text into sentences
    sentences = re.split('[.!?]+', text.lower())
    
    # Extract sentences containing problem indicators
    problems = []
    for sentence in sentences:
        if any(indicator in sentence for indicator in problem_indicators):
            problems.append(sentence.strip())
    
    return problems

def scrape_reddit_posts(num_posts=1000):
    posts = []
    problems = []
    subreddit = reddit.subreddit('jobs')
    
    print(f"Scraping posts from r/jobs...")
    
    # Get posts from the last 30 days
    cutoff_date = datetime.utcnow() - timedelta(days=30)
    
    for post in subreddit.hot(limit=num_posts):
        # Check if post is from within our time window
        post_date = datetime.fromtimestamp(post.created_utc)
        if post_date < cutoff_date:
            continue
            
        # Check title and selftext
        if is_networking_related(post.title) or is_networking_related(post.selftext):
            posts.append({
                'title': post.title,
                'text': post.selftext,
                'score': post.score,
                'created_utc': post_date,
                'num_comments': post.num_comments
            })
            
            # Extract problems
            problems.extend(extract_problems(post.title))
            problems.extend(extract_problems(post.selftext))
            
            print(f"Found networking-related post {len(posts)}/{num_posts}")
            
            if len(posts) >= num_posts:
                break
    
    return posts, problems

def main():
    # Install required packages if not already installed
    try:
        import praw
        import pandas as pd
        from textblob import TextBlob
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        print("Installing required packages...")
        import subprocess
        subprocess.check_call(['pip', 'install', 'praw', 'pandas', 'textblob', 'matplotlib', 'seaborn'])
        print("Packages installed successfully!")

    # Scrape posts
    print("Starting to scrape Reddit posts...")
    posts, problems = scrape_reddit_posts(1000)
    
    # Convert to DataFrame
    df = pd.DataFrame(posts)
    print(f"Total networking-related posts found: {len(df)}")
    
    if len(df) == 0:
        print("No networking-related posts were found. Please check the scraping process.")
        return
    
    # Print DataFrame columns for debugging
    print("\nDataFrame columns:", df.columns.tolist())
    
    # Analyze sentiment
    print("\nAnalyzing sentiment...")
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    
    # Plot sentiment distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='sentiment', bins=30)
    plt.title('Sentiment Distribution of Networking-Related Posts')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Count')
    plt.savefig('sentiment_distribution.png')
    plt.close()
    
    print(f"Average sentiment: {df['sentiment'].mean():.3f}")
    
    # Analyze common problems
    print("\nAnalyzing common problems...")
    problem_counter = Counter(problems)
    top_5_problems = problem_counter.most_common(5)
    
    print("\nTop 5 Most Common Networking Problems:")
    for i, (problem, count) in enumerate(top_5_problems, 1):
        print(f"{i}. {problem} (mentioned {count} times)")
    
    # Plot top 5 problems
    plt.figure(figsize=(12, 6))
    problems, counts = zip(*top_5_problems)
    plt.bar(range(len(problems)), counts)
    plt.xticks(range(len(problems)), problems, rotation=45, ha='right')
    plt.title('Top 5 Most Common Networking Problems')
    plt.xlabel('Problem')
    plt.ylabel('Number of Mentions')
    plt.tight_layout()
    plt.savefig('top_problems.png')
    plt.close()
    
    # Save results
    print("\nSaving results...")
    df.to_csv('networking_posts.csv', index=False)
    
    with open('networking_problems.txt', 'w') as f:
        for problem, count in problem_counter.most_common():
            f.write(f"{problem}: {count}\n")
    
    print("Analysis complete! Results saved to:")
    print("- networking_posts.csv")
    print("- networking_problems.txt")
    print("- sentiment_distribution.png")
    print("- top_problems.png")

if __name__ == "__main__":
    main()

Starting to scrape Reddit posts...
Scraping posts from r/jobs...


  cutoff_date = datetime.utcnow() - timedelta(days=30)


Found networking-related post 1/1000
Found networking-related post 2/1000
Found networking-related post 3/1000
Found networking-related post 4/1000
Found networking-related post 5/1000
Found networking-related post 6/1000
Found networking-related post 7/1000
Found networking-related post 8/1000
Found networking-related post 9/1000
Found networking-related post 10/1000
Found networking-related post 11/1000
Found networking-related post 12/1000
Found networking-related post 13/1000
Found networking-related post 14/1000
Found networking-related post 15/1000
Found networking-related post 16/1000
Found networking-related post 17/1000
Found networking-related post 18/1000
Found networking-related post 19/1000
Found networking-related post 20/1000
Found networking-related post 21/1000
Found networking-related post 22/1000
Found networking-related post 23/1000
Found networking-related post 24/1000
Found networking-related post 25/1000
Found networking-related post 26/1000
Found networking-rela

  plt.tight_layout()



Saving results...
Analysis complete! Results saved to:
- networking_posts.csv
- networking_problems.txt
- sentiment_distribution.png
- top_problems.png


In [9]:
import praw
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.util import ngrams
import string

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

# Initialize Reddit instance
reddit = praw.Reddit(
    client_id="nNt1apRLcond1DKKcfhMJg",
    client_secret="OYprE9hUaZriB4lRixxH6iq3duoZXw",
    user_agent="script:job_hunting_analysis:v1.0 (by /u/YOUR_USERNAME)"
)

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    return ' '.join(tokens)

def extract_complaints(text):
    # Common complaint indicators
    complaint_indicators = [
        'difficult', 'hard', 'trouble', 'struggle', 'problem', 'issue',
        'challenge', 'frustrated', 'anxious', 'nervous', 'afraid', 'scared',
        'don\'t know', 'confused', 'overwhelmed', 'stuck', 'hate', 'terrible',
        'awful', 'worst', 'impossible', 'ridiculous', 'annoying', 'tired of',
        'sick of', 'fed up with', 'exhausted', 'drained', 'burned out'
    ]
    
    # Split text into sentences
    sentences = re.split('[.!?]+', text.lower())
    
    # Extract sentences containing complaint indicators
    complaints = []
    for sentence in sentences:
        if any(indicator in sentence for indicator in complaint_indicators):
            complaints.append(sentence.strip())
    
    return complaints

def get_top_ngrams(texts, n=2, top_n=10):
    # Create n-grams
    ngram_list = []
    for text in texts:
        tokens = word_tokenize(text)
        ngram_list.extend(list(ngrams(tokens, n)))
    
    # Count n-grams
    ngram_counts = Counter(ngram_list)
    
    # Get top n-grams
    return ngram_counts.most_common(top_n)

def analyze_topics(texts, n_topics=10, n_words=5):
    # Preprocess texts
    processed_texts = [preprocess_text(text) for text in texts]
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2),  # Include both unigrams and bigrams
        stop_words='english'
    )
    
    # Create document-term matrix
    dtm = vectorizer.fit_transform(processed_texts)
    
    # Apply Non-negative Matrix Factorization for topic modeling
    nmf = NMF(n_components=n_topics, random_state=42)
    nmf.fit(dtm)
    
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    
    # Get top words for each topic
    topics = []
    for topic_idx, topic in enumerate(nmf.components_):
        top_words_idx = topic.argsort()[:-n_words-1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(top_words)
    
    return topics

def scrape_reddit_posts():
    posts = []
    complaints = []
    subreddit = reddit.subreddit('jobs')
    
    print(f"Scraping posts from r/jobs...")
    
    # Get posts from the last year
    cutoff_date = datetime.utcnow() - timedelta(days=365)
    
    # Get posts from different time periods to ensure we get enough data
    for time_filter in ['hot', 'top', 'new']:
        print(f"\nFetching {time_filter} posts...")
        for post in getattr(subreddit, time_filter)(limit=None):
            # Check if post is from within our time window
            post_date = datetime.fromtimestamp(post.created_utc)
            if post_date < cutoff_date:
                continue
            
            # Extract complaints
            post_complaints = extract_complaints(post.title) + extract_complaints(post.selftext)
            
            if post_complaints:  # Only include posts with complaints
                posts.append({
                    'title': post.title,
                    'text': post.selftext,
                    'score': post.score,
                    'created_utc': post_date,
                    'num_comments': post.num_comments,
                    'complaints': post_complaints
                })
                
                for complaint in post_complaints:
                    complaints.append({
                        'text': complaint,
                        'score': post.score
                    })
                
                print(f"Found relevant post {len(posts)} (from {post_date.strftime('%Y-%m-%d')})")
    
    return posts, complaints

def analyze_complaints(complaints):
    # Convert complaints to DataFrame
    df_complaints = pd.DataFrame(complaints)
    
    # Get all complaint texts
    complaint_texts = df_complaints['text'].tolist()
    
    # Analyze topics
    print("\nAnalyzing topics...")
    topics = analyze_topics(complaint_texts)
    
    # Get top bigrams
    print("\nAnalyzing bigrams...")
    top_bigrams = get_top_ngrams(complaint_texts, n=2)
    
    return topics, top_bigrams

def main():
    # Install required packages if not already installed
    try:
        import praw
        import pandas as pd
        from textblob import TextBlob
        import matplotlib.pyplot as plt
        import seaborn as sns
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.decomposition import NMF
        import nltk
    except ImportError:
        print("Installing required packages...")
        import subprocess
        subprocess.check_call(['pip', 'install', 'praw', 'pandas', 'textblob', 'matplotlib', 'seaborn', 'scikit-learn', 'nltk'])
        print("Packages installed successfully!")

    # Scrape posts
    print("Starting to scrape Reddit posts...")
    posts, complaints = scrape_reddit_posts()
    
    # Convert posts to DataFrame
    df_posts = pd.DataFrame(posts)
    print(f"\nTotal relevant posts found: {len(df_posts)}")
    
    if len(df_posts) == 0:
        print("No relevant posts were found. Please check the scraping process.")
        return
    
    # Analyze complaints
    topics, top_bigrams = analyze_complaints(complaints)
    
    # Create summary DataFrame for topics
    topic_data = []
    for i, topic in enumerate(topics, 1):
        topic_data.append({
            'Topic': f'Topic {i}',
            'Top Words': ', '.join(topic)
        })
    
    topic_df = pd.DataFrame(topic_data)
    
    # Create summary DataFrame for bigrams
    bigram_data = []
    for bigram, count in top_bigrams:
        bigram_data.append({
            'Bigram': ' '.join(bigram),
            'Count': count
        })
    
    bigram_df = pd.DataFrame(bigram_data)
    
    # Print results
    print("\nTop Topics:")
    print("===========")
    print(topic_df.to_string(index=False))
    
    print("\nTop Bigrams:")
    print("============")
    print(bigram_df.to_string(index=False))
    
    # Plot results
    plt.figure(figsize=(12, 6))
    sns.barplot(data=bigram_df, x='Bigram', y='Count')
    plt.title('Top Bigrams in Job Hunting Complaints')
    plt.xlabel('Bigram')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('complaint_bigrams.png')
    plt.close()
    
    # Save results
    print("\nSaving results...")
    df_posts.to_csv('job_hunting_posts.csv', index=False)
    topic_df.to_csv('topic_analysis.csv', index=False)
    bigram_df.to_csv('bigram_analysis.csv', index=False)
    
    # Save detailed summary
    with open('complaint_analysis.txt', 'w') as f:
        f.write("Job Hunting Complaints Analysis (Last Year)\n")
        f.write("========================================\n\n")
        f.write("Top Topics:\n")
        f.write("----------\n\n")
        for _, row in topic_df.iterrows():
            f.write(f"{row['Topic']}:\n")
            f.write(f"- Top Words: {row['Top Words']}\n\n")
        
        f.write("\nTop Bigrams:\n")
        f.write("------------\n\n")
        for _, row in bigram_df.iterrows():
            f.write(f"- {row['Bigram']}: {row['Count']} occurrences\n")
    
    print("\nAnalysis complete! Results saved to:")
    print("- job_hunting_posts.csv")
    print("- topic_analysis.csv")
    print("- bigram_analysis.csv")
    print("- complaint_analysis.txt")
    print("- complaint_bigrams.png")

if __name__ == "__main__":
    main()

Starting to scrape Reddit posts...
Scraping posts from r/jobs...

Fetching hot posts...


  cutoff_date = datetime.utcnow() - timedelta(days=365)


Found relevant post 1 (from 2025-03-22)
Found relevant post 2 (from 2025-03-22)
Found relevant post 3 (from 2025-03-21)
Found relevant post 4 (from 2025-03-22)
Found relevant post 5 (from 2025-03-21)
Found relevant post 6 (from 2025-03-21)
Found relevant post 7 (from 2025-03-22)
Found relevant post 8 (from 2025-03-22)
Found relevant post 9 (from 2025-03-22)
Found relevant post 10 (from 2025-03-22)
Found relevant post 11 (from 2025-03-22)
Found relevant post 12 (from 2025-03-22)
Found relevant post 13 (from 2025-03-22)
Found relevant post 14 (from 2025-03-22)
Found relevant post 15 (from 2025-03-22)
Found relevant post 16 (from 2025-03-22)
Found relevant post 17 (from 2025-03-22)
Found relevant post 18 (from 2025-03-22)
Found relevant post 19 (from 2025-03-22)
Found relevant post 20 (from 2025-03-21)
Found relevant post 21 (from 2025-03-21)
Found relevant post 22 (from 2025-03-22)
Found relevant post 23 (from 2025-03-22)
Found relevant post 24 (from 2025-03-22)
Found relevant post 25 (f

In [10]:
import requests

token = 'EMMSKJGCMIW7BSU47IX7'
headers = {
    'Authorization': f'Bearer {token}'
}

params = {
    'q': 'tech',
    'location.address': 'Toronto',
    'sort_by': 'date',
    'start_date.range_start': '2025-04-01T00:00:00Z',
    'start_date.range_end': '2025-04-30T23:59:59Z'
}

response = requests.get('https://www.eventbriteapi.com/v3/events/search/', headers=headers, params=params)

data = response.json()

for event in data['events']:
    print(f"Name: {event['name']['text']}")
    print(f"Start: {event['start']['local']}")
    print(f"URL: {event['url']}")
    print('-' * 40)


KeyError: 'events'