# **EXPLORATORY DATA ANALYSIS (EDA) ON THE DATABASE**

Importing packages and modules

In [1]:
import pymongo 
import matplotlib.pyplot as plt 
import pandas as pd
from datetime import datetime
import plotly.express as px

**Configuring the MongoDB Connection**

In [2]:
# Connection string handed to pymongo.MongoClient by the cells below.
# It targets a MongoDB server on the local machine at port 27017.
mongo_uri = "mongodb://localhost:27017/"

**Different Socio-Economic Causes In Our Database**

In [3]:

db_name = "summarized_clean"

# Connect to MongoDB and select the database to summarize.
client = pymongo.MongoClient(mongo_uri)
db = client[db_name]

# Build one record per collection: its name and the number of documents in it.
collection_names = db.list_collection_names()
collection_data = []

for collection_name in collection_names:
    collection = db[collection_name]
    record_count = collection.count_documents({})
    collection_data.append({"Collection": collection_name, "Count": record_count})

# Tree map with one tile per collection. Passing values="Count" sizes each tile
# by its document count; without it the counts gathered above are never used
# and every tile is drawn with the same area.
# NOTE(review): a collection holding zero documents will presumably get no
# visible area -- confirm that is acceptable for this overview.
fig = px.treemap(
    collection_data,
    path=["Collection"],
    values="Count",
    title="Different Socio-Economic Causes In Our Database",
)
# Print only the collection name on each tile; the count is conveyed by area.
fig.update_traces(textinfo="label")
fig.show()


**Count Of Articles For Different Causes**

In [4]:
# Open the database whose collections are going to be counted.
database_name = "summarized_clean"
client = pymongo.MongoClient(mongo_uri)
db = client[database_name]

# Map each collection name to the number of documents it contains.
collection_names = db.list_collection_names()
collection_sizes = {
    name: db[name].count_documents({}) for name in collection_names
}

# Bar chart: one bar per collection, bar height = document count.
# Collection names double as the topic labels on the x axis.
fig = px.bar(
    x=list(collection_sizes),
    y=list(collection_sizes.values()),
)
fig.update_layout(
    title="Number of Articles in Each Socio-Economic Topic",
    xaxis_title="Socio-Economic Topic",
    yaxis_title="Number of Articles Collected",
    xaxis_tickangle=-45,  # slant the tick labels
)
fig.show()

**Monthly Distribution Of Articles For Each Cause**

In [6]:
database_name = "articles_6_months"

# Connect to the MongoDB server and select the database.
client = pymongo.MongoClient(mongo_uri)
db = client[database_name]

# List the collections the user can choose from.
collection_names = db.list_collection_names()

# Without any collection no number the user types could ever be valid, so the
# prompt loop below would never terminate. Fail early with a clear message.
if not collection_names:
    raise RuntimeError(f"No collections found in database '{database_name}'.")

# Show a 1-based menu and ask for a choice through the builtin input() prompt.
print("Available Socio-Economic Topics:")
for idx, name in enumerate(collection_names):
    print(f"{idx + 1}. {name}")

selected_collection_index = -1  # Start invalid so the prompt runs at least once

while not 0 <= selected_collection_index < len(collection_names):
    try:
        # The menu is 1-based; convert the answer to a 0-based list index.
        selected_collection_index = int(input("Enter the number of the collection you want to visualize: ")) - 1
    except ValueError:
        # Non-numeric answer: the index keeps its previous (invalid) value, so we ask again.
        print("Invalid input. Please enter a valid number.")
        continue

    if not 0 <= selected_collection_index < len(collection_names):
        print("Invalid collection selection. Please enter a valid number.")

selected_collection_name = collection_names[selected_collection_index]

# Fetch only the publication date of every document in the chosen collection.
collection = db[selected_collection_name]
cursor = collection.find({}, {"published_date": 1})

# Parse the date strings, e.g. "Mon, 02 Jan 2023 10:00:00 GMT".
# Documents whose date is absent, not a string, or in another format are
# skipped and counted instead of aborting the whole cell.
# NOTE: %a and %b are matched against the current locale's day/month names.
date_format = "%a, %d %b %Y %H:%M:%S GMT"
date_list = []
skipped_count = 0
for doc in cursor:
    try:
        date_list.append(datetime.strptime(doc["published_date"], date_format))
    except (KeyError, TypeError, ValueError):
        skipped_count += 1

if skipped_count:
    print(f"Skipped {skipped_count} document(s) with a missing or unparseable published_date.")

# pd.to_datetime keeps the column datetime-typed even when date_list is empty,
# so the .dt accessor below is always available.
df = pd.DataFrame({"date": pd.to_datetime(date_list)})
df['month_year'] = df['date'].dt.to_period('M')

# Count the articles per month. value_counts() orders by frequency, so sort by
# period afterwards to get the months in chronological order.
article_counts = df['month_year'].value_counts().sort_index()

# Derive the axis order from the data itself rather than from a fixed list of
# months, so every month present in the collection lands in the right place.
month_labels = article_counts.index.strftime('%b %Y')
custom_category_order = list(month_labels)

# Bar chart: one bar per month, bar height = number of articles.
fig = px.bar(
    x=month_labels,
    y=article_counts.values,
    title=f"Monthly Distribution of  Articles in {selected_collection_name}",
    labels={"x": "Month-Year", "y": f"Number of Articles for {selected_collection_name}"},
    category_orders={"x": custom_category_order},
)
fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
fig.show()

Available Socio-Economic Topics:
1. Social Mobility
2. Gender Pay Gap
3. Wealth Distribution
4. Minimum Wage
5. Labor Rights
6. Economic Development
7. Income Inequality
8. Racial Discrimination
9. Economic Empowerment
10. Poverty
11. Education Access
12. Access to Resources
13. Healthcare Access
14. Wealth Gap
15. Food Security
16. Rape
17. Housing Crisis
