In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.tree import DecisionTreeClassifier, plot_tree
import statsmodels.api as sm
from scipy.stats import chi2_contingency
from dython.nominal import associations
# Read in data from CSV
df = pd.read_csv('file.csv')

# Basic DataFrame operations.
# A notebook cell renders only its final bare expression, so the intermediate
# views are passed to display() explicitly; otherwise they are computed and
# thrown away without ever being shown.
display(df.head())      # View first 5 rows
display(df.tail())      # View last 5 rows
display(df.describe())  # Summary statistics for numerical columns
df.info()               # Info on column data types and missing values (prints on its own)
df.shape                # Get number of rows and columns (rendered as the cell result)


In [None]:
# Read the Excel workbook into `data` using the pandas defaults.
data = pd.read_excel(
    io='Data_Analytics_Take_Home_HL_repaired.xlsx',
)

In [None]:
# NOTE(review): this cell reads as a menu of cleaning recipes with placeholder
# column names; consecutive lines that target the same column look like
# alternatives rather than steps meant to run together. Confirm before use.
#
# Every step rebinds `df` (or the column) instead of using inplace=True.
# `df['Column'].fillna(..., inplace=True)` mutates an intermediate Series:
# pandas 2.x warns about it and, with Copy-on-Write enabled, `df` is never
# updated at all.

# Drop columns
df = df.drop(columns=['Column1', 'Column2'])

# Rename columns
df = df.rename(columns={'OldName': 'NewName'})

# Check for and sum missing values
df.isnull().sum()

# Fill missing values
df['Column'] = df['Column'].fillna(df['Column'].mean())    # Fill with mean
df['Column'] = df['Column'].fillna(df['Column'].median())  # Fill with median
df['Column'] = df['Column'].fillna('Value')                # Fill with specific value

# Drop missing values
df = df.dropna()

# Convert data types
df['Column'] = df['Column'].astype('int')
df['Column'] = df['Column'].astype('float')
df['Column'] = df['Column'].astype('category')

# String operations
df['Column'] = df['Column'].str.lower()  # Convert to lower case
df['Column'] = df['Column'].str.upper()  # Convert to upper case
df['Column'] = df['Column'].str.strip()  # Remove whitespace

# Datetime operations
df['DateColumn'] = pd.to_datetime(df['DateColumn'])
df['Year'] = df['DateColumn'].dt.year
df['Month'] = df['DateColumn'].dt.month
df['Day'] = df['DateColumn'].dt.day


In [None]:
# Apply functions
df['Column'] = df['Column'].apply(lambda x: x + 1)

# Operations on columns
# NOTE(review): the second assignment overwrites the first; these look like
# two alternative recipes rather than steps meant to run together.
df['NewColumn'] = df['Column1'] + df['Column2']
df['NewColumn'] = df['Column1'] * df['Column2']

# Aggregations
# None of these is the last expression of the cell, so without display() the
# results are computed and discarded without being rendered.
display(df.groupby('Column').sum())
display(df.groupby('Column').mean())
display(df.groupby('Column').agg({'Column2': 'sum', 'Column3': 'mean'}))

# Sorting (rebinding instead of inplace=True keeps the step chainable)
df = df.sort_values(by='Column', ascending=False)

# Reset index
df = df.reset_index(drop=True)


In [None]:
# Basic plot: line chart of Column2 against Column1 with title and axis labels.
fig, ax = plt.subplots()
ax.plot(df['Column1'], df['Column2'])
ax.set_title('Title')
ax.set_xlabel('X-axis Label')
ax.set_ylabel('Y-axis Label')
plt.show()

# Histogram of a single column, 50 bins.
fig, ax = plt.subplots()
df['Column'].hist(bins=50, ax=ax)
plt.show()

# Scatter plot of the same pair of columns.
fig, ax = plt.subplots()
ax.scatter(df['Column1'], df['Column2'])
plt.show()

# Boxplot: one box per listed column.
fig, ax = plt.subplots()
df.boxplot(column=['Column1', 'Column2'], ax=ax)
plt.show()


# Row counts per (page_category, page_topic_description) pair, pivoted so that
# page_topic_description values become the columns.
grouped_counts = (
    data
    .groupby(['page_category', 'page_topic_description'])
    .size()
    .unstack()
)


In [None]:
# One-hot encode the categorical column. drop_first=True omits the indicator
# for the first level, leaving k-1 columns for k levels.
df_encoded = pd.get_dummies(
    data=df,
    columns=['CategoricalColumn'],
    drop_first=True,
)


In [None]:
def calculate_conversion_rate(data, column_name):
    """Print and return the conversion rate per ('Asset Shown', column_name) group.

    The rate of a group is the number of its rows with ``Conversion == 1``
    divided by the number of its rows with a non-null ``Conversion``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Conversion', 'Asset Shown' and `column_name`.
    column_name : str
        Name of the second grouping column.

    Returns
    -------
    pandas.Series
        Rates indexed by ('Asset Shown', column_name), sorted from highest to
        lowest. The same Series is printed, followed by a blank separator.
    """
    group_keys = ['Asset Shown', column_name]

    total_shown = data.groupby(group_keys)['Conversion'].count()

    # Groups without a single conversion are absent from the filtered frame.
    # Align the numerator to every shown group with an explicit 0 so their
    # rate is 0.0 instead of NaN.
    conversions = (
        data[data['Conversion'] == 1]
        .groupby(group_keys)['Conversion']
        .count()
        .reindex(total_shown.index, fill_value=0)
    )

    conversion_rate = conversions / total_shown

    sorted_conversion_rate = conversion_rate.sort_values(ascending=False)
    print(sorted_conversion_rate)
    print('\n')
    return sorted_conversion_rate

    
def calculate_revenue(data, column_name, asset_revenues=(5, 7, 2.5)):
    """Compute revenue from converted rows, per (column_name, asset) and per column_name.

    Each converted row earns the revenue of the asset that was shown. Prices
    are matched to assets by the asset's position in the sorted list of
    distinct 'Asset Shown' values (first asset -> ``asset_revenues[0]``, and
    so on, cycling if there are more assets than prices).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Conversion', 'Asset Shown' and `column_name`.
    column_name : str
        Column whose values the revenue is broken down by.
    asset_revenues : sequence of float, optional
        Revenue per conversion for each asset, in sorted-asset order.
        Defaults to the values previously hard-coded in this function.

    Returns
    -------
    revenue_by_column : pandas.DataFrame
        Columns [column_name, 'Revenue'], sorted by 'Revenue' descending.
    conversions_table : pandas.DataFrame
        Columns [column_name, 'Asset Shown', 'counts', 'Revenue'], one row per
        combination that has at least one conversion.
    """
    converted_data = data[data['Conversion'] == 1]
    revenue_cycle = np.asarray(asset_revenues, dtype=float)

    # Tie each price to an asset *identity*. Assigning prices by row position
    # breaks as soon as one (column value, asset) combination has no
    # conversions: every following row then shifts onto the wrong price.
    # The full frame is used so that an asset with zero conversions anywhere
    # still keeps its slot in the ordering.
    asset_order = sorted(data['Asset Shown'].dropna().unique())
    revenue_by_asset = {
        asset: revenue_cycle[position % len(revenue_cycle)]
        for position, asset in enumerate(asset_order)
    }

    conversions_table = (
        converted_data
        .groupby([column_name, 'Asset Shown'])
        .size()
        .reset_index(name='counts')
    )
    conversions_table['Revenue'] = (
        conversions_table['counts']
        * conversions_table['Asset Shown'].map(revenue_by_asset).astype(float)
    )

    revenue_by_column = (
        conversions_table
        .groupby(column_name)['Revenue']
        .sum()
        .reset_index()
        .sort_values(by='Revenue', ascending=False)
    )
    return revenue_by_column, conversions_table

In [None]:
def reshuffle_data(data, random_state=13):
    """Return a class-balanced, shuffled copy of `data` by undersampling.

    Rows are split on the 'Conversion' column (0 vs 1). The larger class is
    randomly undersampled to the size of the smaller one, both parts are
    concatenated, shuffled, and given a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Conversion' column with values 0 and 1.
    random_state : int, optional
        Seed used for both the undersampling and the final shuffle, so the
        result is reproducible across runs. Defaults to 13.

    Returns
    -------
    pandas.DataFrame
        Balanced data with equal numbers of Conversion == 0 and == 1 rows.
    """
    non_converted = data[data['Conversion'] == 0]
    converted = data[data['Conversion'] == 1]

    # Undersample down to the smaller class. Sampling more rows than a class
    # contains (without replacement) raises, which is what happened when
    # conversions outnumbered non-conversions.
    target_size = min(len(non_converted), len(converted))
    non_converted_sample = non_converted.sample(target_size, random_state=random_state)
    if len(converted) > target_size:
        converted_sample = converted.sample(target_size, random_state=random_state)
    else:
        converted_sample = converted

    # Concatenate the two balanced halves
    balanced_data = pd.concat([non_converted_sample, converted_sample], axis=0)

    # Shuffle the rows; seeded so repeated runs produce the same ordering.
    balanced_data = balanced_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return balanced_data

In [None]:
# Logistic regression of Conversion on time_difference alone.
# add_constant() adds the constant column used as the intercept term.
predictor_constant = sm.add_constant(data['time_difference'])
logit_spec = sm.Logit(endog=data['Conversion'], exog=predictor_constant)
model = logit_spec.fit()
summary_table = model.summary()
print(summary_table)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

# Load the data
data = pd.read_csv('data.csv')

# Preprocess and clean the data
# ... (handle missing values, correct data types, etc.)

# One-hot encoding
data = pd.get_dummies(data, columns=['Known Diagnosis', 'Page Category', 'Device Type', 'Page Topic Description', 'Asset Shown'])

# Exploratory Data Analysis (EDA)
# ... (calculate average conversion rates, visualize distributions, etc.)

# Prepare the data for modeling
# NOTE(review): 'Other non-predictive columns' is a placeholder, not a real
# column name; replace it with the actual columns before running this cell.
X = data.drop(['Conversion', 'Revenue', 'Other non-predictive columns'], axis=1)
y = data['Conversion']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict probabilities on the held-out rows (used for evaluation only)
probabilities = model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score and report it (it was previously computed and never shown)
roc_auc = roc_auc_score(y_test, probabilities)
print(f'ROC AUC: {roc_auc:.3f}')

# Calculate expected revenue for each combination.
# `probabilities` only covers the 20% test split, so multiplying it by a
# full-length column of `data` fails on the length mismatch. Score every row
# of X instead, so the result lines up with `data` row for row.
conversion_probability = model.predict_proba(X)[:, 1]
data['Expected Revenue'] = conversion_probability * data['Revenue per Conversion']

# Find the best asset for each user/page combination
# NOTE(review): 'Expected Revenue A/B/C' are not created anywhere in the
# visible code. They presumably need to be built first, one per asset, by
# scoring each row as if that asset had been shown. TODO confirm and implement.
data['Best Asset'] = data[['Expected Revenue A', 'Expected Revenue B', 'Expected Revenue C']].idxmax(axis=1)

# Report on findings
# ... (summarize variable importance, lucrative combinations, etc.)

# Calculate incremental revenue
# ... (estimate potential revenue gains from optimal asset matching)
