In [None]:
import pandas as pd
import json

# Read the combined spreadsheet from the Colab working directory
data = pd.read_excel(io='/content/Combined_Data.xlsx')

# Function to convert JSON strings to dictionaries and flatten them
def extract_json_objects(json_str):
    """Parse a serialized list of keyword records into a flat DataFrame.

    The text may be strict JSON or a Python-literal repr (single-quoted
    strings), so several parsers are tried in turn:

    1. ``json.loads`` on the raw text;
    2. ``ast.literal_eval`` for Python-style literals, which keeps
       apostrophes inside values (e.g. ``o'brien``) intact;
    3. the legacy fallback of swapping single quotes for double quotes and
       calling ``json.loads``, for single-quoted text that still uses JSON
       tokens such as ``null``.

    Parameters
    ----------
    json_str : str
        Serialized dict or list of dicts. Non-string input (for example NaN
        from an empty spreadsheet cell) is treated as "no data".

    Returns
    -------
    pandas.DataFrame
        One row per record, flattened with ``pd.json_normalize``. An empty
        DataFrame is returned when the input is missing, cannot be parsed,
        or is not a dict / list of dicts.
    """
    # Local import: keeps the function usable without editing the cell's imports
    import ast

    if not isinstance(json_str, str) or not json_str.strip():
        return pd.DataFrame()

    parsers = (
        json.loads,
        ast.literal_eval,
        # Legacy behaviour, kept last because it corrupts values with apostrophes
        lambda text: json.loads(text.replace('\'', '\"')),
    )
    for parse in parsers:
        try:
            parsed = parse(json_str)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError; literal_eval raises the rest
            continue
        break
    else:
        # Every parser rejected the text
        return pd.DataFrame()

    if isinstance(parsed, dict):
        return pd.json_normalize(parsed)
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return pd.json_normalize(parsed)
    # Scalars, strings or mixed lists are not keyword records
    return pd.DataFrame()

# Expand every row's keyword list into one record per keyword. The per-row
# frames are gathered in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
keyword_frames = []
for index, keywords in data['keywords'].items():
    row_data = extract_json_objects(keywords)
    if not row_data.empty:
        # Tag each record with the row it came from (used as the merge key)
        row_data['original_index'] = index
        keyword_frames.append(row_data)

if keyword_frames:
    expanded_data = pd.concat(keyword_frames, ignore_index=True)
    # Left merge keeps rows whose keywords could not be parsed; a row with
    # several keywords appears once per keyword.
    result = pd.merge(data, expanded_data, how='left', left_index=True, right_on='original_index')
    # Drop the helper key and rebuild a clean 0..n-1 index (the merge leaves
    # a float index, which shows up as 0.0, 1.0, ... when printed).
    result = result.drop(columns='original_index').reset_index(drop=True)
else:
    # Nothing could be parsed: there is no 'original_index' column to merge
    # on, so the merge would raise a KeyError. Fall back to the untouched input.
    expanded_data = pd.DataFrame()
    result = data.copy()

# Display the first few rows of the resulting DataFrame
print(result.head())


                                              keywords        name  \
0.0  [{'name': 'glocations', 'value': 'iran', 'rank...  glocations   
1.0  [{'name': 'glocations', 'value': 'iran', 'rank...     persons   
2.0  [{'name': 'glocations', 'value': 'iran', 'rank...     subject   
3.0  [{'name': 'glocations', 'value': 'iran', 'rank...     subject   
4.0  [{'name': 'glocations', 'value': 'iran', 'rank...     subject   

                                     value  rank major  
0.0                                   iran   1.0     n  
1.0                   hassani, gholam-reza   2.0     n  
2.0  united states international relations   3.0     n  
3.0                              forecasts   4.0     n  
4.0                                  islam   5.0     n  


In [1]:
import pandas as pd
import json
from google.colab import files  # Importing the files module

# Read the combined spreadsheet from the Colab working directory
data = pd.read_excel(io='/content/Combined_Data.xlsx')

# Function to convert JSON strings to dictionaries and flatten them
def extract_json_objects(json_str):
    """Parse a serialized list of keyword records into a flat DataFrame.

    The text may be strict JSON or a Python-literal repr (single-quoted
    strings), so several parsers are tried in turn:

    1. ``json.loads`` on the raw text;
    2. ``ast.literal_eval`` for Python-style literals, which keeps
       apostrophes inside values (e.g. ``o'brien``) intact;
    3. the legacy fallback of swapping single quotes for double quotes and
       calling ``json.loads``, for single-quoted text that still uses JSON
       tokens such as ``null``.

    Parameters
    ----------
    json_str : str
        Serialized dict or list of dicts. Non-string input (for example NaN
        from an empty spreadsheet cell) is treated as "no data".

    Returns
    -------
    pandas.DataFrame
        One row per record, flattened with ``pd.json_normalize``. An empty
        DataFrame is returned when the input is missing, cannot be parsed,
        or is not a dict / list of dicts.
    """
    # Local import: keeps the function usable without editing the cell's imports
    import ast

    if not isinstance(json_str, str) or not json_str.strip():
        return pd.DataFrame()

    parsers = (
        json.loads,
        ast.literal_eval,
        # Legacy behaviour, kept last because it corrupts values with apostrophes
        lambda text: json.loads(text.replace('\'', '\"')),
    )
    for parse in parsers:
        try:
            parsed = parse(json_str)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError; literal_eval raises the rest
            continue
        break
    else:
        # Every parser rejected the text
        return pd.DataFrame()

    if isinstance(parsed, dict):
        return pd.json_normalize(parsed)
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return pd.json_normalize(parsed)
    # Scalars, strings or mixed lists are not keyword records
    return pd.DataFrame()

# Expand every row's keyword list into one record per keyword. The per-row
# frames are gathered in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
keyword_frames = []
for index, keywords in data['keywords'].items():
    row_data = extract_json_objects(keywords)
    if not row_data.empty:
        # Tag each record with the row it came from (used as the merge key)
        row_data['original_index'] = index
        keyword_frames.append(row_data)

if keyword_frames:
    expanded_data = pd.concat(keyword_frames, ignore_index=True)
    # Left merge keeps rows whose keywords could not be parsed; a row with
    # several keywords appears once per keyword.
    result = pd.merge(data, expanded_data, how='left', left_index=True, right_on='original_index')
    # Drop the helper key and rebuild a clean 0..n-1 index
    result = result.drop(columns='original_index').reset_index(drop=True)
else:
    # Nothing could be parsed: there is no 'original_index' column to merge
    # on, so the merge would raise a KeyError. Fall back to the untouched input.
    expanded_data = pd.DataFrame()
    result = data.copy()

# Save the resulting DataFrame as a CSV file (index omitted on purpose)
result.to_csv('/content/result.csv', index=False)

# Download the CSV file from Colab
files.download('/content/result.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from textblob import TextBlob

# Load the dataset
file_path = '/content/result.csv'
data = pd.read_csv(file_path)

# Add numerical features: word count and sentiment score.
# Empty text cells come back from read_csv as NaN (a float), on which
# `x.split()` raises AttributeError; count them as zero-word texts instead.
data['word_count'] = (
    data['cleaned_value_no_stopwords'].fillna('').astype(str).str.split().str.len()
)
# Labels outside this mapping become NaN and are ignored by .corr()
data['sentiment_score'] = data['sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

# Correlation analysis
correlation_matrix = data[['word_count', 'sentiment_score']].corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Hypothesis testing: compare word counts between positive and neutral sentiments
# (.loc with a boolean mask avoids chained indexing)
positive_word_counts = data.loc[data['sentiment'] == 'positive', 'word_count']
neutral_word_counts = data.loc[data['sentiment'] == 'neutral', 'word_count']

# Perform Welch's t-test (equal_var=False: the groups may have unequal variances)
t_stat, p_value = stats.ttest_ind(positive_word_counts, neutral_word_counts, equal_var=False)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Interpret the hypothesis test
alpha = 0.05
if np.isnan(p_value):
    # A NaN p-value (e.g. an empty or single-row group) is not evidence of
    # "no difference", so report it separately instead of falling through.
    print("Test inconclusive - at least one group has too few observations.")
elif p_value < alpha:
    print("Reject the null hypothesis - Significant differences exist between the groups.")
else:
    print("Fail to reject the null hypothesis - No significant difference between the groups.")


              cleaned_value_no_stopwords sentiment
0  united states international relations   neutral
1                              forecasts   neutral
2                                  islam   neutral
3  united states international relations   neutral
4                              forecasts   neutral


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the dataset
file_path = '/content/labeled_politically_relevant_data.csv'
politically_relevant_data = pd.read_csv(file_path)

# Raw text and labels. Missing text becomes an empty string because
# TfidfVectorizer rejects NaN documents.
texts = politically_relevant_data['cleaned_value_no_stopwords'].fillna('')
y = politically_relevant_data['sentiment']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# lets test-set vocabulary and document frequencies leak into the training
# features, which inflates the evaluation scores.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights on the training split only, then apply
# the fitted transformation to the held-out split.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Using 1000 features for this example
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Full-corpus feature matrix, kept under the original name `X` in case other
# cells rely on it (built with the train-fitted vectorizer).
X = tfidf_vectorizer.transform(texts)

# Train a logistic regression model on the training data
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Predict the sentiment labels for the test data
y_pred = logistic_model.predict(X_test)

# Generate a classification report to evaluate the model's performance
classification_results = classification_report(y_test, y_pred)
print(classification_results)


              precision    recall  f1-score   support

    negative       1.00      0.98      0.99      1780
     neutral       1.00      1.00      1.00     38062
    positive       1.00      0.98      0.99      1206

    accuracy                           1.00     41048
   macro avg       1.00      0.98      0.99     41048
weighted avg       1.00      1.00      1.00     41048



In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from textblob import TextBlob

# Load the dataset from the specified file path
file_path = '/content/labeled_politically_relevant_data.csv'
data = pd.read_csv(file_path)

# Calculate numerical features: word count and sentiment score.
# Empty text cells come back from read_csv as NaN (a float), on which
# `x.split()` raises AttributeError; count them as zero-word texts instead.
data['word_count'] = (
    data['cleaned_value_no_stopwords'].fillna('').astype(str).str.split().str.len()
)
# Labels outside this mapping become NaN and are ignored by .corr()
data['sentiment_score'] = data['sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

# Perform correlation analysis between word count and sentiment score
correlation_matrix = data[['word_count', 'sentiment_score']].corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Hypothesis testing: compare word counts between positive and neutral sentiments
# (.loc with a boolean mask avoids chained indexing)
positive_word_counts = data.loc[data['sentiment'] == 'positive', 'word_count']
neutral_word_counts = data.loc[data['sentiment'] == 'neutral', 'word_count']

# Conduct Welch's t-test (equal_var=False) to compare the mean word counts of the two groups
# Null Hypothesis (H0): There is no difference in the mean word counts between positive and neutral sentiments
# Alternative Hypothesis (H1): There is a significant difference in the mean word counts between positive and neutral sentiments
t_stat, p_value = stats.ttest_ind(positive_word_counts, neutral_word_counts, equal_var=False)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Interpret the results of the hypothesis test
alpha = 0.05
if np.isnan(p_value):
    # A NaN p-value (e.g. an empty or single-row group) is not evidence of
    # "no difference", so report it separately instead of falling through.
    print("Test inconclusive - at least one group has too few observations.")
elif p_value < alpha:
    # Reject the null hypothesis if the p-value is less than alpha
    print("Reject the null hypothesis - Significant differences exist between the groups.")
else:
    # Fail to reject the null hypothesis if the p-value is greater than or equal to alpha
    print("Fail to reject the null hypothesis - No significant difference between the groups.")


Correlation Matrix:
                 word_count  sentiment_score
word_count         1.000000        -0.069988
sentiment_score   -0.069988         1.000000
T-statistic: 24.229384639246998, P-value: 3.2446094418305666e-124
Reject the null hypothesis - Significant differences exist between the groups.
