![reddit banner](https://cdn.dribbble.com/users/1761084/screenshots/3587716/reddit.gif)

In [20]:
# Importing important libraries
import praw
import pandas as pd
import configparser
import datetime as dt

In [8]:
# For reading configuration files for Reddit Credentials
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Check that list so a missing credentials file fails here
# with a clear message instead of a NoSectionError on the first lookup below.
if not config.read('reddit_credentials.ini'):
    raise FileNotFoundError("Could not read 'reddit_credentials.ini'")

# Storing credential info in local variables
user_agent = config.get('credentials', 'user_agent')
client_id = config.get('credentials', 'client_id')
client_secret = config.get('credentials', 'client_secret')
redirect_url = config.get('credentials', 'redirect_url')

In [9]:
# Creating read-only Reddit instance
# PRAW's configuration key is 'redirect_uri'; a 'redirect_url' keyword is not a
# recognised setting, so the configured value was never applied.
reddit = praw.Reddit(user_agent = user_agent,
                    client_id = client_id,
                    client_secret = client_secret,
                    redirect_uri = redirect_url)

## Extracting Comments
For our project, we are going to use the top 3 most popular Reddit communities:
* Machine Learning - [r/MachineLearning](https://www.reddit.com/r/MachineLearning/)
* Artificial Intelligence - [r/artificial](https://www.reddit.com/r/Artificial/)
* Data Science - [r/DataScience](https://www.reddit.com/r/DataScience/)

We will extract the top 1000 posts of all time from each subreddit to create our dataset, along with some other useful information such as Post URL (& ID), User posted, Post title, Flair, Number of Comments, Time Created, Upvote Ratio and Score.

In [10]:
# Request the all-time top submissions (at most 3000) from the three
# subreddits, queried together as a single combined 'a+b+c' listing.
# NOTE(review): the limit applies to the combined listing, not to each
# subreddit separately -- confirm the result really holds ~1000 posts per subreddit.
combined_subreddits = reddit.subreddit('MachineLearning+artificial+datascience')
posts = combined_subreddits.top(time_filter = 'all', limit = 3000)

In [11]:
# Creating DataFrame of the top posts along with other attributes for analysis

# One record per submission. The subreddit is stored as its plain name
# (str() of a PRAW Subreddit gives its display name) so the DataFrame holds
# ordinary strings rather than live PRAW objects tied to the Reddit session.
posts_list = [
    {
        'post_id' : post.id,
        'post_title' : post.title,
        'subreddit' : str(post.subreddit),
        'time_created' : post.created_utc,
        'post_url' : post.url,
        'flair_text' : post.link_flair_text,
        'score' : post.score,
        'comments' : post.num_comments,
        'upvote_ratio' : post.upvote_ratio
    }
    for post in posts
]

posts_df = pd.DataFrame(posts_list)

In [24]:
# Converting the Unix epoch (seconds, UTC) in 'time_created' to date-time values.
# pd.to_datetime(unit='s') interprets the epoch as UTC and is vectorised, whereas
# dt.datetime.fromtimestamp() shifts each value to the machine's local time zone,
# which makes the resulting timestamps differ from one machine to another.
posts_df['date-time'] = pd.to_datetime(posts_df['time_created'], unit = 's')

# Creating 'Year' column
posts_df['year'] = posts_df['date-time'].dt.year

# Dropping 'time_created' column (rebinding instead of mutating in place)
posts_df = posts_df.drop(columns = 'time_created')

In [25]:
# Display the posts DataFrame as the cell output (pandas elides the middle rows)
posts_df

Unnamed: 0,post_id,post_title,subreddit,post_url,flair_text,score,comments,upvote_ratio,date-time,year
0,gh1dj9,[Project] From books to presentations in 10s w...,MachineLearning,https://v.redd.it/v492uoheuxx41,Project,7798,186,0.99,2020-05-10 13:19:54,2020
1,kuc6tz,[D] A Demo from 1993 of 32-year-old Yann LeCun...,MachineLearning,https://v.redd.it/25nxi9ojfha61,Discussion,5851,133,0.98,2021-01-10 10:30:36,2021
2,g7nfvb,[R] First Order Motion Model applied to animat...,MachineLearning,https://v.redd.it/rlmmjm1q5wu41,Research,4761,111,0.97,2020-04-25 04:27:23,2020
3,lui92h,[N] AI can turn old photos into moving Images ...,MachineLearning,https://v.redd.it/ikd5gjlbi8k61,News,4688,230,0.97,2021-02-28 15:12:28,2021
4,ohxnts,[D] This AI reveals how much time politicians ...,MachineLearning,https://i.redd.it/34sgziebfia71.jpg,Discussion,4568,228,0.96,2021-07-11 04:18:59,2021
...,...,...,...,...,...,...,...,...,...,...
2982,slx33m,We live in beautiful times where you can learn...,artificial,https://github.com/louisfb01/start-machine-lea...,Discussion,84,6,0.90,2022-02-06 13:50:02,2022
2983,k9otbj,Yann LeCun’s Deep Learning Course Free From NYU,artificial,https://www.i-programmer.info/news/99-professi...,News,78,1,0.97,2020-12-09 09:22:52,2020
2984,k2orib,You Can Now Learn for FREE: 9 Courses by Googl...,artificial,https://laconicml.com/free-artificial-intellig...,Self Promotion,80,2,0.95,2020-11-28 14:43:43,2020
2985,ex9w4w,"Chatbot trained on ""public domain social media...",artificial,https://ai.googleblog.com/2020/01/towards-conv...,news,80,10,0.97,2020-02-01 17:55:23,2020


In [12]:
# Write the posts table to a CSV file with a header row; the row index is
# left out so that only the data columns are stored.
posts_df.to_csv(
    "Top_Posts.csv",
    index = False,
    header = True,
)

In [13]:
# Reload the posts from the CSV written above (this replaces the in-memory
# frame with the on-disk version) and show ten randomly chosen rows
posts_df = pd.read_csv('Top_Posts.csv')
posts_df.sample(n = 10)

Unnamed: 0,post_id,post_title,subreddit,time_created,post_url,flair_text,score,comments,upvote_ratio
2562,8h1saq,Facebook is using billions of Instagram images...,artificial,1525461000.0,https://www.theverge.com/2018/5/2/17311808/fac...,,113,24,0.94
2851,10cfef6,Inpainting with the Visuali editor (beta),artificial,1673774000.0,https://v.redd.it/4ncbg5mgv3ca1,Research,88,6,0.96
960,117bptb,PyGWalker: Turn your Pandas Dataframe into a T...,datascience,1676910000.0,https://www.reddit.com/r/datascience/comments/...,Projects,472,47,0.99
2701,5shha6,Fear at the top: The CEO of Google DeepMind is...,artificial,1486421000.0,http://www.businessinsider.com/google-deepmind...,,98,34,0.9
2432,btgj82,AI Trained on 100 Million Opinions Can Predict...,artificial,1558924000.0,https://blog.photofeeler.com/photofeeler-d3/,,132,27,0.99
2916,zvhy5w,PaLM vs. ChatGPT: Who Will Win the AI Race?,artificial,1672040000.0,https://medium.com/inkwater-atlas/palm-vs-chat...,Self Promotion,87,3,0.93
1545,i2bvrr,[P] Open RL Benchmark @ 0.3.0 (benchmark.clean...,MachineLearning,1596374000.0,https://v.redd.it/80lthq5cale51,Project,350,14,0.96
2532,hgttkm,This AI translates code from a programming lan...,artificial,1593266000.0,https://youtu.be/u6kM2lkrGQk,News,113,12,0.97
2155,j0m182,Jump Rope + AI. Keeping both on point! Made th...,artificial,1601187000.0,https://v.redd.it/5fr03wigsmp51,My project,214,11,0.95
2787,8d3zuy,Scientists develop artificial intelligence sys...,artificial,1524039000.0,https://www.financialexpress.com/lifestyle/sci...,,91,2,0.97


We will use the 'post_id' to further extract the comments from the Top Posts.

In [None]:
# Collect every comment of every top post into one DataFrame

comments_list = []

for submission_id in posts_df['post_id']:
    thread = reddit.submission(submission_id)
    # limit = None asks PRAW to resolve all "load more comments" placeholders,
    # so the flattened list below contains only real comments
    thread.comments.replace_more(limit = None)

    # One record per comment, keyed by the post it belongs to
    comments_list.extend(
        {
            'post_id' : submission_id,
            'comment' : reply.body
        }
        for reply in thread.comments.list()
    )

comments_df = pd.DataFrame(comments_list)

In [None]:
# Write the comments table to a CSV file with a header row and without the
# row index
comments_df.to_csv(
    'Top_Posts_Comments.csv',
    index = False,
    header = True,
)

In [15]:
# Reload the comments from the saved CSV (replacing the in-memory frame)
# and show ten randomly chosen rows
comments_df = pd.read_csv('Top_Posts_Comments.csv')
comments_df.sample(n = 10)

Unnamed: 0,post_id,comment
61970,11w03sy,!remindme one week
146623,2lmo0l,"Hello Dr Hinton, Im doing a case study in my c..."
96224,r76igz,Transformers robots in disguise
62757,ulvdgm,"Love your work, scared of your name, uncertain..."
185205,kf2j1l,What does dagster bring to airflow that airflo...
200955,riup34,Great comment! I'm a hybrid data engineer/data...
2316,hohvgq,Average DS guy from a business undergrad. Don’...
169021,b3zlha,gpt-2 finish this\n\n
135045,65ukie,You obvoiusly need to search better in the lat...
214825,bl6gbm,[deleted]


In [19]:
# Report the (rows, columns) shape of both datasets
posts_shape = posts_df.shape
comments_shape = comments_df.shape

print("Shape of Posts Data - {}".format(posts_shape))
print("Shape of Comments Data - {}".format(comments_shape))

Shape of Posts Data - (2987, 9)
Shape of Comments Data - (223174, 2)
