# Retrieving data

## Reddit

In [None]:
!pip install praw

In [None]:
import praw
import pandas as pd
import time
from datetime import datetime
import string

### Posts

In [None]:
# Function to fetch up to 1000 recent posts from the specified subreddit
def fetch_posts(subreddit_name, max_retries=5):
    """Fetch up to 1000 of the newest submissions from a subreddit.

    Uses the module-level ``reddit`` client, which must exist before the call.

    Args:
        subreddit_name: Name of the subreddit to read.
        max_retries: Number of times the whole fetch is restarted after a
            rate-limit error. Bounded so that a persistent rate limit cannot
            keep the function retrying forever.

    Returns:
        A list of dicts, one per submission, with the keys ``title``,
        ``score``, ``id``, ``subreddit``, ``url``, ``num_comments``,
        ``selftext`` and ``created_utc`` (a naive ``datetime`` in UTC).
        Errors are printed instead of raised; the posts collected before the
        error are returned.
    """
    posts_data = []
    for attempt in range(max_retries + 1):
        # Every attempt starts from scratch so a retry never duplicates posts.
        posts_data = []
        try:
            for submission in reddit.subreddit(subreddit_name).new(limit=1000):
                post = {
                    'title': submission.title,
                    'score': submission.score,
                    'id': submission.id,
                    'subreddit': str(submission.subreddit),
                    'url': submission.url,
                    'num_comments': submission.num_comments,
                    'selftext': submission.selftext,
                    'created_utc': datetime.utcfromtimestamp(submission.created_utc)
                }
                posts_data.append(post)
                # Sleep to avoid hitting the rate limit
                time.sleep(1.2)  # Sleeping a bit more than 1 sec to stay within the limit
            return posts_data
        except praw.exceptions.APIException as e:
            # NOTE(review): rate-limit detection relies on this error_type
            # string -- confirm it matches what PRAW actually reports.
            if e.error_type != "TooManyRequests":
                print(f"An error occurred: {e}")
                return posts_data
            if attempt == max_retries:
                print(f"Rate limit exceeded. Giving up after {max_retries} retries.")
                return posts_data
            print("Rate limit exceeded. Retrying in 60 seconds.")
            time.sleep(60)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return posts_data
    return posts_data

In [None]:
def save_to_excel(posts, subreddit_name, file_path):
    """Write the collected posts to a timestamped .xlsx file.

    ``file_path`` is used as a plain prefix: the file name is appended to it
    without any separator, so a directory must be passed with its trailing
    separator. Nothing is written when ``posts`` is empty.
    """
    if posts:
        filename = f"{file_path}{subreddit_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}.xlsx"
        # One row per post dict; the index column is left out of the sheet.
        pd.DataFrame(posts).to_excel(filename, index=False)
        print(f"Saved {len(posts)} posts from /r/{subreddit_name} to {filename}")
    else:
        print("No posts to save.")

In [None]:
# Reddit API client. It is read as a global by fetch_posts, fetch_ai_posts and
# fetch_comments_from_excel, so this cell must run before any of them is called.
# The credential names passed below are not assigned anywhere in the visible
# notebook -- define them in the session before running this cell.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password)

In [1]:
# Using subreddit name
subreddit_name = 'aiwars'
# Placeholder: this self-assignment only works if `file_path` already exists in
# the session (otherwise it raises NameError). Replace the right-hand side with
# the real output location.
file_path = file_path

In [None]:
# Fetch the newest posts of the configured subreddit and export them.
# The guard skips the export when fetch_posts returned an empty list.
posts = fetch_posts(subreddit_name)
if posts:
    save_to_excel(posts, subreddit_name, file_path)

In [None]:
# Function to fetch up to 1000 posts with a given flair from the specified subreddit
def fetch_ai_posts(subreddit_name, flair):
    """Search a subreddit for submissions carrying ``flair``.

    The search goes through the module-level ``reddit`` client with the
    lucene syntax, sorted by 'hot' and limited to 1000 results.

    Returns a list of dicts (``title``, ``score``, ``id``, ``subreddit``,
    ``url``, ``num_comments``, ``selftext``, ``created_utc``). Any exception
    is printed and the results gathered up to that point are returned.
    """
    search_query = f'flair:"{flair}"'
    collected = []
    try:
        results = reddit.subreddit(subreddit_name).search(
            search_query, sort='hot', syntax='lucene', limit=1000)
        for hit in results:
            collected.append({
                'title': hit.title,
                'score': hit.score,
                'id': hit.id,
                'subreddit': str(hit.subreddit),
                'url': hit.url,
                'num_comments': hit.num_comments,
                'selftext': hit.selftext,
                'created_utc': datetime.utcfromtimestamp(hit.created_utc),
            })
    except Exception as e:
        print(f"An error occurred: {e}")
    return collected

In [None]:
# Subreddit and flair passed to fetch_ai_posts; the flair is inserted into a
# `flair:"..."` search query.
subreddit_name = "Futurology"
flair = "AI"

In [None]:
# Run the flair-restricted search and export the result.
# The guard skips the export when nothing was returned.
posts = fetch_ai_posts(subreddit_name, flair)
if posts:
    save_to_excel(posts, subreddit_name, file_path)

### Comments

In [None]:
# A comment is "legal" only if every character belongs to string.printable
def is_legal_comment(comment_body):
    """Return True when ``comment_body`` has no character outside ``string.printable``.

    An empty body counts as legal.
    """
    for ch in comment_body:
        if ch not in string.printable:
            return False
    return True

# Write one batch of collected comments to its own timestamped Excel file
def _save_comment_batch(comments, batch_number, output_dir):
    import os

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    filename = os.path.join(output_dir, f"comments_batch{batch_number}_{timestamp}.xlsx")
    pd.DataFrame(comments).to_excel(filename, index=False)
    print(f"Saved {len(comments)} comments to {filename}")
    return filename


# Function to fetch comments based on the first post and comment ID
def fetch_comments_from_excel(file_path, start_post_id=None, start_comment_id=None, output_dir=None):
    """Download the comments of every post listed in an exported posts workbook.

    Uses the module-level ``reddit`` client. Comments are written to Excel in
    batches of 900 rows, followed by a 60 second pause after each batch.

    Args:
        file_path: Excel file holding an ``id`` column of submission ids.
        start_post_id: If given, processing starts at the first row whose
            ``id`` equals this value instead of at the first row.
        start_comment_id: If given, comments of the first processed post are
            skipped until this comment id is reached (that comment is kept).
            It has no effect on later posts.
        output_dir: Directory for the batch files. Defaults to the directory
            of ``file_path`` (or the current directory).

    Raises:
        IndexError: If ``start_post_id`` is given but not present in the file.
    """
    import os

    posts_df = pd.read_excel(file_path)

    if output_dir is None:
        if isinstance(file_path, (str, os.PathLike)):
            output_dir = os.path.dirname(file_path) or "."
        else:
            output_dir = "."

    if start_post_id:
        matches = posts_df.index[posts_df['id'] == start_post_id].tolist()
        if not matches:
            raise IndexError(f"start_post_id {start_post_id!r} not found in {file_path}")
        start_index = matches[0]
    else:
        start_index = 0

    comments_data = []
    batch_number = 1  # Initialize batch number
    for post_id in posts_df['id'].iloc[start_index:]:
        submission = reddit.submission(id=post_id)
        # Expand every "load more comments" placeholder so .list() is complete.
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            if start_comment_id:
                if comment.id != start_comment_id:
                    continue
                # Resume point reached: keep this comment and everything after it.
                start_comment_id = None

            if not is_legal_comment(comment.body):
                continue

            comments_data.append({
                'post_id': post_id,
                'comment_id': comment.id,
                'comment_body': comment.body,
                'comment_author': comment.author.name if comment.author else 'Deleted',
                'comment_score': comment.score,
                'created_utc': datetime.utcfromtimestamp(comment.created_utc)
            })
            time.sleep(1.2)

            if len(comments_data) >= 900:
                _save_comment_batch(comments_data, batch_number, output_dir)
                comments_data = []
                batch_number += 1

                time.sleep(60)

        # The resume marker belongs to the first post only. Clearing it here
        # keeps an unmatched id from suppressing the comments of later posts.
        start_comment_id = None

    if comments_data:
        _save_comment_batch(comments_data, batch_number, output_dir)

In [None]:
# Placeholder self-assignment: `file_path` must already hold the path of the
# posts workbook, because fetch_comments_from_excel opens it with pd.read_excel.
file_path = file_path

# Resume point: start at this post and, within it, at this comment.
start_post_id = '1arxh37'
start_comment_id = 'kqni8h5'
fetch_comments_from_excel(file_path, start_post_id, start_comment_id)

## X

In [None]:
import tweepy
from tweepy import Client

In [None]:
# X API client. It is read as a global by fetch_tweets and fetch_comments, so
# this cell must run before either is called. The credential names passed
# below are not assigned anywhere in the visible notebook -- define them in
# the session first.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret)

### Posts

In [None]:
# Function to fetch the posts
def fetch_tweets(query, max_results=100):
    """Search recent English, non-retweet tweets matching ``query``.

    Uses the module-level ``client`` (a ``tweepy.Client``). Results are
    requested page by page, sorted by relevancy, with a pause between
    requests.

    Args:
        query: Search expression; ``" lang:en -is:retweet"`` is appended.
        max_results: Maximum number of tweets to return.

    Returns:
        A list of at most ``max_results`` tweet objects. Tweepy errors are
        printed and the tweets collected so far are returned.
    """
    query += " lang:en -is:retweet"
    tweet_fields = ["attachments", "author_id", "conversation_id", "created_at", "id", "in_reply_to_user_id", "text"]
    tweets_list = []

    if max_results <= 0:
        return tweets_list

    # The recent-search endpoint accepts a page size between 10 and 100.
    page_size = max(10, min(max_results, 100))

    try:
        pages = tweepy.Paginator(client.search_recent_tweets, query=query,
                                 tweet_fields=','.join(tweet_fields),
                                 sort_order="relevancy",
                                 max_results=page_size)
        for page in pages:
            tweets_list.extend(page.data or [])

            has_next = bool((page.meta or {}).get("next_token"))
            if len(tweets_list) >= max_results or not has_next:
                break

            # Sleep to respect rate limit (60 requests per 15 mins -> 1 request per 15 seconds).
            # One page is one request, so the pause goes between pages, not tweets.
            time.sleep(15)

    except tweepy.TweepyException as e:
        print(f"An error occurred: {e}")

    return tweets_list[:max_results]

In [None]:
# Function to save the retrieved posts
def save_tweets_to_excel(tweets, file_path, search_phrase):
    """Write tweets to ``<file_path>/Tweets_<phrase>_<timestamp>.xlsx``.

    Args:
        tweets: Iterable of tweet objects exposing ``conversation_id``, ``id``,
            ``author_id``, ``created_at``, ``text`` and ``in_reply_to_user_id``.
        file_path: Existing directory in which the workbook is created. It is
            used as given; only the search phrase is sanitised.
        search_phrase: Label for the file name; characters other than
            letters, digits, ``.``, ``_`` and ``-`` are dropped.

    Returns:
        The DataFrame that was written, or ``None`` when ``tweets`` is empty.
    """
    if not tweets:
        print("No tweets to save.")
        return None

    tweets_data = [{
        "Conversation ID": str(tweet.conversation_id),
        "Tweet ID": str(tweet.id),
        "Author ID": str(tweet.author_id),
        "Created At": tweet.created_at.strftime('%Y-%m-%d %H:%M:%S') if tweet.created_at else 'N/A',
        "Text": tweet.text,
        "In reply to": str(tweet.in_reply_to_user_id)
    } for tweet in tweets]

    tweets_df = pd.DataFrame(tweets_data)

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    sanitized_search_phrase = ''.join(char for char in search_phrase if char.isalnum() or char in "._-")
    # The directory part is deliberately left alone: filtering the whole path
    # would strip legitimate characters such as the ':' of a drive letter.
    filename = f"{file_path}/Tweets_{sanitized_search_phrase}_{timestamp}.xlsx"

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        tweets_df.to_excel(writer, sheet_name='Tweets', index=False)

    # Report before returning; the message used to sit after the return.
    print(f"Saved {len(tweets)} tweets to {filename}")
    return tweets_df

In [None]:
# Search phrase; fetch_tweets appends its own language/retweet filters to it.
query = "artificial intelligence"
# Placeholder self-assignment: `file_path` must already exist in the session.
# save_tweets_to_excel joins it to the file name with "/".
file_path = file_path

tweets = fetch_tweets(query)
save_tweets_to_excel(tweets, file_path, query)

### Comments

In [2]:
# Function reads an Excel file for Conversation IDs, fetches up to max_comments replies in total for these IDs
def fetch_comments(input_file_path, output_file_path, start_index=7, max_comments=900,
                   label="comments_climatechange_3"):
    """Collect replies for the conversations listed in an exported tweets workbook.

    Uses the module-level ``client`` (a ``tweepy.Client``) and hands the
    result to ``save_tweets_to_excel``.

    Args:
        input_file_path: Excel file with a ``Conversation ID`` column.
        output_file_path: Directory passed on to ``save_tweets_to_excel``.
        start_index: Number of leading rows to skip. The default of 7 is the
            value that used to be hard-coded.
        max_comments: Overall cap on collected replies across all
            conversations.
        label: Search-phrase label used in the output file name.

    A Tweepy error stops the collection; whatever was gathered up to that
    point is still saved.
    """
    df = pd.read_excel(input_file_path)
    all_comments = []

    for conversation_id in df['Conversation ID'].iloc[start_index:]:
        remaining = max_comments - len(all_comments)
        if remaining <= 0:
            break

        query = f'conversation_id:{conversation_id}'
        try:
            replies = tweepy.Paginator(client.search_recent_tweets, query=query,
                                       tweet_fields=["conversation_id", "author_id", "created_at", "id", "text", "in_reply_to_user_id"],
                                       max_results=100)
            # Limiting flatten() stops paging as soon as the cap is reached.
            for tweet in replies.flatten(limit=remaining):
                all_comments.append(tweet)

            # Pause between conversations to stay clear of the rate limit.
            time.sleep(12)

        except tweepy.TweepyException as e:
            print(f"An error occurred: {e}")
            break

    if all_comments:
        save_tweets_to_excel(all_comments, output_file_path, label)
    else:
        print("No comments were fetched.")

In [None]:
# Placeholder self-assignments: both names must already exist in the session.
# fetch_comments reads the first with pd.read_excel and passes the second to
# save_tweets_to_excel as the output location.
input_file_path = input_file_path
output_file_path = output_file_path

In [None]:
# Collect the replies for the listed conversations and export them.
fetch_comments(input_file_path, output_file_path)

## TikTok

In [None]:
!pip install xlsxwriter

In [None]:
import requests
import xlsxwriter
from datetime import datetime, timedelta

In [None]:
"""
  Generating Client Access Token

"""

### Videos

In [None]:
# Function fetches videos on TikTok based on the criteria
def fetch_videos(search_phrase1, search_phrase2, access_token, max_videos=500,
                 start_date="20240101", end_date="20240130", region_codes=("US",)):
    """Query the TikTok Research API for videos tagged with both hashtags.

    Args:
        search_phrase1: First required hashtag name.
        search_phrase2: Second required hashtag name.
        access_token: Bearer token sent in the Authorization header.
        max_videos: Maximum number of videos to return.
        start_date: First day of the search window, ``YYYYMMDD``.
        end_date: Last day of the search window, ``YYYYMMDD``.
            NOTE(review): the API is believed to cap the window length --
            confirm against the endpoint reference before widening it.
        region_codes: Region codes a video may belong to.

    Returns:
        A list of at most ``max_videos`` video dicts as returned by the API,
        each extended with a ``'Video URL'`` entry. A non-200 response is
        printed and ends the collection with the videos gathered so far.

    Raises:
        requests.exceptions.RequestException: On connection failure or when a
            request exceeds the 30 second timeout.
    """
    url = 'https://open.tiktokapis.com/v2/research/video/query/'
    headers = {
        'Authorization': f'Bearer {access_token}',
        'Content-Type': 'application/json'
    }

    query_params = {
        "fields": "id,video_description,create_time,region_code,share_count,view_count,like_count,comment_count,hashtag_names,username,voice_to_text"
    }

    payload = {
        "query": {
            "and": [
                {
                    "operation": "IN",
                    "field_name": "region_code",
                    "field_values": list(region_codes)
                },
                {
                    "operation": "EQ",
                    "field_name": "hashtag_name",
                    "field_values": [search_phrase1]
                },
                {
                    "operation": "EQ",
                    "field_name": "hashtag_name",
                    "field_values": [search_phrase2]
                }
            ]
        },
        "start_date": start_date,
        "end_date": end_date,
        "max_count": 100,
    }

    videos = []
    cursor = None
    search_id = None

    while len(videos) < max_videos:
        if cursor is not None:
            payload['cursor'] = cursor
        if search_id:
            # Ties follow-up pages to the result set of the first request.
            payload['search_id'] = search_id

        response = requests.post(url, headers=headers, params=query_params,
                                 json=payload, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch videos: {response.text}")
            break

        data = response.json().get('data') or {}
        video_data = data.get('videos') or []
        # The paging position is reported as `cursor`; `next_cursor` is kept
        # only as a fallback for the key the earlier code looked for.
        cursor = data.get('cursor', data.get('next_cursor'))
        search_id = data.get('search_id', search_id)
        has_more = data.get('has_more', bool(cursor))

        for video in video_data:
            if isinstance(video, dict):
                video_id = video.get('id')
                username = video.get('username', 'unknown')
                video['Video URL'] = f"https://www.tiktok.com/@{username}/video/{video_id}"
                videos.append(video)

        if not video_data or not has_more or not cursor:
            break

    # Truncate here so every exit path honours max_videos.
    return videos[:max_videos]

In [None]:
def save_to_excel(videos, file_path, search_phrase=None):
    """Write TikTok videos to ``<file_path>/TikTok_Videos_<phrase>_<timestamp>.xlsx``.

    NOTE: this definition reuses the name of the earlier
    ``save_to_excel(posts, subreddit_name, file_path)``; once this cell has
    run, that earlier three-argument version is no longer reachable.

    Args:
        videos: List of video dicts; nothing is written when it is empty.
        file_path: Existing directory in which the workbook is created.
        search_phrase: Label for the file name. When omitted, the
            module-level ``search_phrase1`` is used, as before.
    """
    if not videos:
        print("No videos to save.")
        return

    if search_phrase is None:
        # Backward-compatible fallback to the notebook global.
        search_phrase = search_phrase1

    videos_df = pd.DataFrame(videos)

    filename = f"{file_path}/TikTok_Videos_{search_phrase}_{datetime.now().strftime('%Y%m%d%H%M%S')}.xlsx"

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        videos_df.to_excel(writer, sheet_name='Videos', index=False)

    print(f"Saved {len(videos)} videos to {filename}")

In [None]:
# Hashtags a video must carry; fetch_videos combines both conditions with "and".
# `search_phrase1` is also read as a global by the TikTok save_to_excel when it
# builds the output file name.
search_phrase1 = "ai"
search_phrase2 = "conspiracy"

In [None]:
# Placeholder self-assignment: `file_path` must already exist in the session.
# save_to_excel joins it to the file name with "/".
file_path = file_path

# `access_token` must hold the TikTok client access token at this point.
videos = fetch_videos(search_phrase1, search_phrase2, access_token)

save_to_excel(videos, file_path)