# In [None]:
# First cell: Import libraries and load data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np

# Load the data
df = pd.read_csv('data.csv')

# Display the first few rows and basic info about the dataset
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Second cell: Derive the two composite scores
# International Outlook is the midpoint of the International Faculty and
# International Students columns.
intl_total = df['International Faculty'].add(df['International Students'])
df['International Outlook'] = intl_total.div(2)

# Academic Performance is the midpoint of the Academic Reputation and
# Citations per Faculty columns.
academic_total = df['Academic Reputation'].add(df['Citations per Faculty'])
df['Academic Performance'] = academic_total.div(2)

# Preview the derived columns next to the institution name
preview_columns = ['Institution Name', 'International Outlook', 'Academic Performance']
print(df[preview_columns].head())

# Third cell: Create scatter plot
plt.figure(figsize=(12, 8))
sns.scatterplot(x='International Outlook', y='Academic Performance', data=df)
plt.title('International Outlook vs Academic Performance')
plt.xlabel('International Outlook Score')
plt.ylabel('Academic Performance Score')

# The least-squares fit has no missing-value handling, so restrict it (and
# the correlation below) to rows where both scores are present.
complete = df[['International Outlook', 'Academic Performance']].dropna()

# Add a trend line (a degree-1 fit needs at least two points)
if len(complete) >= 2:
    z = np.polyfit(complete['International Outlook'], complete['Academic Performance'], 1)
    p = np.poly1d(z)
    # A straight line is fully described by its two endpoints, so there is no
    # need to evaluate and draw it at every data point.
    x_ends = np.array([
        complete['International Outlook'].min(),
        complete['International Outlook'].max(),
    ])
    plt.plot(x_ends, p(x_ends), "r--")

# Calculate correlation coefficient on the same complete rows used for the fit
correlation = complete['International Outlook'].corr(complete['Academic Performance'])
plt.annotate(f'Correlation: {correlation:.2f}', xy=(0.05, 0.95), xycoords='axes fraction')

plt.tight_layout()
plt.show()

# Fourth cell: Perform statistical test
# Perform a statistical test (e.g., Pearson correlation)
# Series.corr (used for the plot annotation) skips rows with a missing score,
# but pearsonr is handed the raw arrays as-is. Test only the complete pairs so
# the reported r matches the annotated correlation.
paired = df[['International Outlook', 'Academic Performance']].dropna()
r, p_value = stats.pearsonr(paired['International Outlook'], paired['Academic Performance'])

print(f"Pearson correlation coefficient: {r}")
print(f"P-value: {p_value}")

# Fifth cell: Identify outliers
# Number of standard deviations from the mean beyond which a score is
# treated as an outlier.
OUTLIER_STD_THRESHOLD = 2

outlook = df['International Outlook']
performance = df['Academic Performance']

# Identify and label outliers
# Compare the absolute deviation so that unusually LOW scores are flagged as
# well as unusually high ones; a one-sided "> mean + k*std" test misses the
# lower tail entirely. Rows with a missing score compare False and are not
# flagged.
df['Outlier'] = (
    ((outlook - outlook.mean()).abs() > OUTLIER_STD_THRESHOLD * outlook.std()) |
    ((performance - performance.mean()).abs() > OUTLIER_STD_THRESHOLD * performance.std())
)

outliers = df[df['Outlier']]
print("Outliers:")
print(outliers[['Institution Name', 'International Outlook', 'Academic Performance']])

# Sixth cell: Scatter plot with the flagged outliers highlighted and named
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(
    data=df,
    x='International Outlook',
    y='Academic Performance',
    hue='Outlier',
)
ax.set_title('International Outlook vs Academic Performance (with Outliers)')
ax.set_xlabel('International Outlook Score')
ax.set_ylabel('Academic Performance Score')

# Write each outlier's institution name at its data point
outlier_points = zip(
    outliers['Institution Name'],
    outliers['International Outlook'],
    outliers['Academic Performance'],
)
for name, x_pos, y_pos in outlier_points:
    ax.annotate(name, (x_pos, y_pos))

plt.tight_layout()
plt.show()

## Interpretation

## The scatter plot shows a [positive/negative/weak/strong] correlation between International Outlook and Academic Performance. The Pearson correlation coefficient of [value] indicates a [weak/moderate/strong] relationship between these two variables.

## The p-value of [value] suggests that this relationship is [statistically significant/not statistically significant] at the 0.05 level.

## Notable outliers include [list some interesting outliers]. These universities might be worth investigating further to understand why they deviate from the general trend.

## These results suggest that [your interpretation of what the data means in the context of your research question].