In [132]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.isidewith.com/polls/popular'
response = requests.get(url)

issues_votes = []

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    polls = soup.find_all('div', class_='poll')
    
    for poll in polls:
        #identifying issue name
        img_div = poll.find('div', class_='img')
        if img_div:
            p_tag = img_div.find('p')
            if p_tag:
                span_tag = p_tag.find('span')
                if span_tag:
                    issue_name = span_tag.text.strip()

                    #logic for finding vote count
                    count_div = poll.find('div', class_='count')
                    if count_div:
                        vote_text = count_div.text.strip().split()[0].replace(',', '')
                        vote_count = int(vote_text) if vote_text.isdigit() else 0
                        issues_votes.append((issue_name, vote_count))

    issues_votes.sort(key=lambda x: x[1], reverse=True)



In [133]:
issues = [issue[0] for issue in issues_votes]
for issue in issues:
    print(issue)
print(issues)

Abortion
Gun Control
Immigration Healthcare
Obamacare
Gay Marriage
Marijuana
Minimum Wage
Terrorism
Common Core
Climate Change
Student Loans
Oil Drilling
Voter Fraud
Immigration
Death Penalty
Planned Parenthood Funding
Drug Policy
Border Security
Affirmative Action
Immigrant Laborers
Fracking
Equal Pay
Government Mandates
In-State Tuition
Government Spending
United Nations
Term Limits
Patriot Act
Solitary Confinement for Juveniles
Space Exploration
Medicaid
Religious Freedom Act
Criminal Voting Rights
Farm Subsidies
Alternative Energy
Israel
Mandatory Vaccinations
GMO Labels
Drones
Euthanasia
First Amendment
Immigrant Assimilation
No-Fly List Gun Control
Nuclear Energy
Police Body Cameras
Net Neutrality
Illegal Immigrant Detainment
Pension Reform
Gerrymandering
NSA Surveillance
Labor Unions
Property Taxes
Confederate Flag
Welfare Drug Testing
Border Wall
Drug Price Regulation
Military Spending
Foreign Assassination
Right of Foreigners to Vote
NSA Domestic Surveillance
Mandatory Militar

In [134]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pandas as pd
import numpy as np

#load dataset
df = pd.read_csv("FixedData.csv")

#apply log transformation to viewCount for normalization
df['LogviewCount'] = np.log1p(df['viewCount'])
df['Logfollowers'] = np.log1p(df['followers'])
df['LoglikeCount'] = np.log1p(df['likeCount'])
df['LogquoteCount'] = np.log1p(df['quoteCount'])
df['LogretweetCount'] = np.log1p(df['retweetCount'])
df['LogreplyCount'] = np.log1p(df['replyCount'])

#extract relevant features
features = df[['LoglikeCount', 'LogreplyCount', 'LogquoteCount', 'LogretweetCount', 'Logfollowers']].values
views = df['LogviewCount'].values.reshape(-1, 1) #reshape viewCount for compatibility

#scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, views, test_size=0.2, random_state=42)

#polynomialize the features to account for non-linearity
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

#linear Regression with Polynomial Features
linear_reg = LinearRegression()
linear_reg.fit(X_train_poly, y_train)
y_pred_linear = linear_reg.predict(X_test_poly)

#initialize and fit linear regression model
reg = LinearRegression().fit(X_train, y_train)

#extract coefficients as feature importance
feature_weights = reg.coef_[0]

#normalize feature weights to sum to 1 for comparison
normalized_weights = feature_weights / np.sum(np.abs(feature_weights))
normalized_weights = (np.abs(normalized_weights))

#print normalized feature weights
print("Normalized Weights for Features: ", normalized_weights)


'''
train_features = torch.tensor(X_train, dtype=torch.float32)
test_features = torch.tensor(X_test, dtype=torch.float32)
train_views = torch.tensor(y_train, dtype=torch.float32)
test_views = torch.tensor(y_test, dtype=torch.float32)
train_dataset = TensorDataset(train_features, train_views)
test_dataset = TensorDataset(test_features, test_views)

class EngagementMLP(nn.Module):
    def __init__(self):
        super(EngagementMLP, self).__init__()
        self.layer1 = nn.Linear(5, 10) #first linear layer
        self.relu = nn.ReLU() #ReLU activation function
        self.dropout = nn.Dropout(0.5) #dropout to prevent overfitting
        self.layer2 = nn.Linear(10, 8) #second linear layer
        self.output = nn.Linear(8, 1) #output layer

    #define forward pass
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        x = self.relu(self.layer2(x))
        x = self.output(x)
        return x

model = EngagementMLP() 
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

def train_model(model, train_dataset, test_dataset, epochs=100):
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    train_losses, test_losses = [], []

    #training loop
    for epoch in range(epochs):
        total_train_loss = 0 #set model to training mode
        model.train()
        for inputs, targets in train_loader: #iterate over batches
            optimizer.zero_grad()
            outputs = model(inputs) #forward pass
            loss = loss_fn(outputs, targets) #calculate loss
            loss.backward() #backpropagate
            optimizer.step() #update weights
            total_train_loss += loss.item() #accumulate loss
        
        #step learning rate scheduler
        scheduler.step()

        #average training loss
        total_train_loss /= len(train_loader)
        train_losses.append(total_train_loss)
        total_test_loss = 0

        #set model to evaluation mode
        model.eval()

        #disable gradient calculation
        with torch.no_grad():
            for inputs, targets in test_loader: #iterate over test batches
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                total_test_loss += loss.item()

        #average testing loss
        total_test_loss /= len(test_loader)
        test_losses.append(total_test_loss)
        
        #print training and testing loss
        print(f'Epoch {epoch+1}, Train Loss: {total_train_loss}, Test Loss: {total_test_loss}')

    return train_losses, test_losses

#train model and capture losses
train_losses, test_losses = train_model(model, train_dataset, test_dataset)

#plot training and testing losses
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model Loss over Epochs')
plt.legend()
plt.show()

def find_weights(model, inputs):
    model.eval()
    importance_scores = np.zeros((5,)) 
    baseline_output = model(inputs).data.numpy()
    
    with torch.no_grad():
        for feature in range(5): 
            perturbed = inputs.clone()
            perturbed[:, feature] += 0.01
            perturbed_output = model(perturbed).data.numpy()
            importance_scores[feature] = np.mean(
                np.abs(perturbed_output - baseline_output)) #calc the change in output due to the perturbation
    return importance_scores


test_inputs = test_features[:1] 
weights = find_weights(model, test_inputs)
normalized_weights = weights / np.sum(weights)

print("Normalized Weights: ", normalized_weights)'''


Normalized Weights for Features:  [0.46669847 0.00973206 0.26982732 0.21459036 0.03915178]


'\ntrain_features = torch.tensor(X_train, dtype=torch.float32)\ntest_features = torch.tensor(X_test, dtype=torch.float32)\ntrain_views = torch.tensor(y_train, dtype=torch.float32)\ntest_views = torch.tensor(y_test, dtype=torch.float32)\ntrain_dataset = TensorDataset(train_features, train_views)\ntest_dataset = TensorDataset(test_features, test_views)\n\nclass EngagementMLP(nn.Module):\n    def __init__(self):\n        super(EngagementMLP, self).__init__()\n        self.layer1 = nn.Linear(5, 10) #first linear layer\n        self.relu = nn.ReLU() #ReLU activation function\n        self.dropout = nn.Dropout(0.5) #dropout to prevent overfitting\n        self.layer2 = nn.Linear(10, 8) #second linear layer\n        self.output = nn.Linear(8, 1) #output layer\n\n    #define forward pass\n    def forward(self, x):\n        x = self.relu(self.layer1(x))\n        x = self.dropout(x)\n        x = self.relu(self.layer2(x))\n        x = self.output(x)\n        return x\n\nmodel = EngagementMLP() \n

In [135]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, y_pred_linear)
mae = mean_absolute_error(y_test, y_pred_linear)
rmse = np.sqrt(mse)

# Print the evaluation results
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)



Mean Squared Error: 0.520736716875286
Mean Absolute Error: 0.5643510839448919
Root Mean Squared Error: 0.7216208955367673


In [136]:
import matplotlib.pyplot as plt

#list of features to calculate engagement score
features = ['likeCount', 'replyCount', 'quoteCount', 'retweetCount', 'followers']
#calculate engagement score using dot product with normalized weights
df['EngagementScore'] = df[features].dot(normalized_weights.T)

#group by searchTerm and calculate mean of engagement scores
topic_engagement = df.groupby('searchTerm')['EngagementScore'].mean()

#calculate quantiles for thresholding engagement scores
low_threshold, high_threshold = topic_engagement.quantile([0.33, 0.66])

#function to categorize engagement score into Low, Medium, High
def categorize_engagement(score):
    if score <= low_threshold:
        return 'Low'
    elif score <= high_threshold:
        return 'Medium'
    else:
        return 'High'

#print engagement scores by topic again for verification
print("Engagement Scores by Topic:")
print(topic_engagement)


Engagement Scores by Topic:
searchTerm
Abortion                   44310.394538
Affirmative Action          7310.741366
Border Camps               11957.959614
China Taiwan               11917.244601
Climate Change             23545.592510
Critical Race Theory       24852.308744
Euthanasia                 13328.148366
Felon Voting Rights         8856.729830
GMO                         6190.639160
Gay Marriage                3924.755234
Gun Control                 7577.798432
Insulin Prices             39753.425194
Iran Israel                28970.525823
Israel Palestine            6975.534167
Medicaid                   70280.688681
Minimum Wage               19254.423472
Obamacare                   8234.739913
Patriot Act                 4386.935098
Pharmaceutical Industry     9130.127273
Police Violence             9746.670236
Student Loans              48607.498198
Term Limits                29329.682609
Terrorism                  14398.823221
Trans Athletes             74889.105632
T

In [137]:
import pandas as pd
import numpy as np
import plotly.express as px

topic_engagement = df.groupby('searchTerm').agg({
    'EngagementScore': 'sum',
    'viewCount': 'sum' 
}).reset_index()

# Applying logarithmic transformation, ensuring there are no non-positive values
topic_engagement['logEngagementScore'] = np.log1p(topic_engagement['EngagementScore'])
topic_engagement['logViewCount'] = np.log1p(topic_engagement['viewCount'])

# Applying categorization based on thresholds (these should be defined prior or calculated here)
low_threshold, high_threshold = topic_engagement['logEngagementScore'].quantile([0.33, 0.66])
topic_engagement['Category'] = topic_engagement['logEngagementScore'].apply(categorize_engagement)

# Map each category to a color
color_map = {'Low': 'red', 'Medium': 'yellow', 'High': 'green'}

# Create an interactive scatter plot using Plotly
fig = px.scatter(topic_engagement, x='logViewCount', y='logEngagementScore',
                 color='Category', color_discrete_map=color_map,
                 hover_name='searchTerm', title='Engagment Scores Clusted by Thresholds',
                 labels={'logViewCount': 'Log View Count (Mean)', 'logEngagementScore': 'Log Engagement Score (Mean)'})

# Add horizontal lines to indicate quantile thresholds
fig.add_hline(y=low_threshold, line_dash="dash", line_color="red")
fig.add_hline(y=high_threshold, line_dash="dash", line_color="green")

# Show the plot
fig.show()

In [138]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans

# Assuming df is your main DataFrame and it includes both 'EngagementScore' and 'viewCount'
# Calculate sum of engagement score and view count per 'searchTerm' and reset index to keep 'searchTerm' in the DataFrame
topic_engagement = df.groupby('searchTerm').agg({
    'EngagementScore': 'mean',
    'viewCount': 'mean'
}).reset_index()

# Applying logarithmic transformation, ensuring there are no non-positive values
topic_engagement['logEngagementScore'] = np.log1p(topic_engagement['EngagementScore'])
topic_engagement['logViewCount'] = np.log1p(topic_engagement['viewCount'])

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # You can choose different number of clusters based on your analysis
topic_engagement['Cluster'] = kmeans.fit_predict(topic_engagement[['logViewCount', 'logEngagementScore']])

# Map each cluster to a color
color_map = {0: 'red', 1: 'yellow', 2: 'green'}
topic_engagement['Color'] = topic_engagement['Cluster'].map(color_map)

# Create an interactive scatter plot using Plotly
fig = px.scatter(topic_engagement, x='logViewCount', y='logEngagementScore',
                 color=topic_engagement['Color'],
                 hover_name='searchTerm', title='Engagment Scores Kmeans Clustered',
                 labels={'logViewCount': 'log(Mean View Count)', 'logEngagementScore': 'Log(Mean Engagement Score)'})

# Hide the legend
fig.update_layout(showlegend=False)

# Show the plot
fig.show()
