In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the dataset (based on the image path and structure)
file_path = 'D:\\Matrix Solution\\BrainWave-Matrix-Solution-intern\\DataSet-Sales\\Adidas US Sales Datasets.xlsx'
df = pd.read_excel(file_path, header=4, index_col=0)

# Step 3: Drop the first column with all missing values (already identified)
df = df.drop(df.columns[0], axis=1)

# Step 4: Basic Info and Date Handling
df['Invoice Date'] = pd.to_datetime(df['Invoice Date'])
df['Month'] = df['Invoice Date'].dt.to_period('M')
df['Year'] = df['Invoice Date'].dt.year

# Step 5: Check for Data Validity (Is Total Sales = Price per Unit * Units Sold?)
df['Calculated Sales'] = df['Price per Unit'] * df['Units Sold']
sales_discrepancy = (df['Total Sales'] - df['Calculated Sales']).abs().sum()

# Step 6: Overview Statistics
overview = {
    "Shape": df.shape,
    "Missing Values": df.isnull().sum(),
    "Data Types": df.dtypes,
    "Sales Discrepancy Sum": sales_discrepancy
}

# Step 7: Save Cleaned Data for Future Use
cleaned_file_path = "adidas_sales_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)

overview


In [None]:
'''
Advanced Statistical Analysis Template
Dataset: Adidas US Sales Data (cleaned DataFrame `df`)
''' 
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.api import VAR
from pygam import LinearGAM, s
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from lifelines import KaplanMeierFitter
import statsmodels.formula.api as smf
# Bayesian modeling imports (e.g., pymc3 or cmdstanpy) if installed
# from pymc3 import Model, Normal, HalfCauchy, sample

# 1. Time-Series Decomposition (STL)
ts = df.set_index('Invoice Date').resample('M')['Total Sales'].sum().asfreq('M')
stl = STL(ts).fit()
stl.plot();

# 2. Vector Auto-Regression (VAR)
df_ts = df.set_index('Invoice Date').resample('M')[['Total Sales','Units Sold','Operating Profit']].sum()
model_var = VAR(df_ts.dropna())
var_res = model_var.fit(maxlags=4, ic='aic')
forecast_var = var_res.forecast(df_ts.values[-var_res.k_ar:], steps=6)
print("VAR forecast (next 6 months):")
print(pd.DataFrame(forecast_var, columns=df_ts.columns))

# 3. Generalized Additive Model (GAM)
X_gam = df[['Price per Unit','Operating Margin']].dropna()
y_gam = df.loc[X_gam.index, 'Units Sold']
gam = LinearGAM(s(0) + s(1)).fit(X_gam, y_gam)
print(gam.summary())

# 4. Principal Component Analysis (PCA)
num_feats = ['Price per Unit','Units Sold','Total Sales','Operating Profit','Operating Margin']
X_pca = df[num_feats].dropna()
pca = PCA(n_components=2)
components = pca.fit_transform(X_pca)
print("Explained variance ratios:", pca.explained_variance_ratio_)

# 5. K-Means Clustering on RFM
# Compute RFM
rfm = df.groupby('Retailer').agg({
    'Invoice Date': lambda x: (df['Invoice Date'].max() - x.max()).days,
    'Invoice Date': 'count',
    'Total Sales': 'sum'
}).rename(columns={'Invoice Date': 'Recency', 'Invoice Date': 'Frequency', 'Total Sales': 'Monetary'})
# Fit KMeans
kmeans = KMeans(n_clusters=4, random_state=42).fit(rfm)
rfm['Segment'] = kmeans.labels_
print(rfm.head())

# 6. Survival Analysis (Time-to-repeat purchase)
# Placeholder: need next purchase dates per customer
def survival_analysis(df):
    kmf = KaplanMeierFitter()
    # T = duration, E = event observed
    # kmf.fit(T, event_observed=E)
    # kmf.plot()
    pass

# 7. Quantile Regression
mod = smf.quantreg('Units_Sold ~ Q("Price per Unit")', df)
res_q10 = mod.fit(q=0.1)
print(res_q10.summary())

# 8. Bayesian Hierarchical Model stub
# Placeholder: build multi-level model with PyMC3 or CmdStan
# def bayesian_hierarchical(df):
#     with Model() as model:
#         # define priors and likelihood
#         trace = sample()
#     return trace

# 9. Extreme Value Theory (GPD)
from scipy.stats import genpareto
threshold = df['Total Sales'].quantile(0.95)
exceedances = df['Total Sales'][df['Total Sales'] > threshold] - threshold
params = genpareto.fit(exceedances)
print("GPD parameters:", params)

# 10. Market-Basket Analysis stub
# Requires invoice-level SKU data in transaction format
# from mlxtend.frequent_patterns import apriori, association_rules
# basket = df.groupby(['InvoiceNo', 'Product'])['Units Sold'].sum().unstack().fillna(0)
# frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
# rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# print(rules.head())


In [None]:
'''
Advanced Statistical Analysis Template
Dataset: Adidas US Sales Data (cleaned DataFrame `df`)
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.api import VAR
from pygam import LinearGAM, s
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from lifelines import KaplanMeierFitter
import statsmodels.formula.api as smf
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pointbiserialr
import scipy.stats as ss
import dcor

# Correlation Analysis and Visualizations

# 1. Pearson Correlation
pearson_corr = df.corr(method='pearson')
plt.figure(figsize=(10, 6))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm')
plt.title('Pearson Correlation')
plt.show()

# 2. Spearman Correlation
spearman_corr = df.corr(method='spearman')
plt.figure(figsize=(10, 6))
sns.heatmap(spearman_corr, annot=True, cmap='viridis')
plt.title('Spearman Correlation')
plt.show()

# 3. Kendall Correlation
kendall_corr = df.corr(method='kendall')
plt.figure(figsize=(10, 6))
sns.heatmap(kendall_corr, annot=True, cmap='magma')
plt.title('Kendall Correlation')
plt.show()

# 4. Point-Biserial Correlation (example on binary vs numeric)
binary_col = 'Sales Method'
numeric_col = 'Total Sales'
# encode binary column
df[binary_col] = df[binary_col].astype('category').cat.codes
r_pb, p_pb = pointbiserialr(df[binary_col], df[numeric_col])
print(f"Point-Biserial Correlation between {binary_col} and {numeric_col}: {r_pb:.3f} (p={p_pb:.3f})")

# 5. Cramer's V function for categorical vs categorical
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(k - 1, r - 1))

cat_cols = df.select_dtypes(include='object').columns.tolist()
for i in range(len(cat_cols)):
    for j in range(i + 1, len(cat_cols)):
        v = cramers_v(df[cat_cols[i]], df[cat_cols[j]])
        print(f"Cramer's V between {cat_cols[i]} and {cat_cols[j]}: {v:.3f}")

# 6. Mutual Information
numeric_features = df.select_dtypes(include=np.number).columns.tolist()
mi = mutual_info_regression(df[numeric_features].drop('Total Sales', axis=1), df['Total Sales'])
mi_df = pd.Series(mi, index=numeric_features[:-1]).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
mi_df.plot(kind='bar', color='skyblue')
plt.title('Mutual Information with Total Sales')
plt.ylabel('Mutual Information')
plt.show()

# 7. Distance Correlation
for col in numeric_features:
    if col != 'Total Sales':
        dc = dcor.distance_correlation(df['Total Sales'], df[col])
        print(f"Distance Correlation between Total Sales and {col}: {dc:.3f}")


In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# 1. Load cleaned data (ensure this file exists in /mnt/data)
df =joblib.load('AdidasPreprocessed.pkl')

# 2. Prepare features and target
# Drop columns not used for prediction
X = df.drop(columns=['Total Sales', 'Calculated Sales', 'Sales Discrepancy', 'Invoice Date', 'Month', 'Year'])
y = df['Total Sales']

# 3. Identify numeric vs categorical features
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()

# 4. Preprocessor pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='drop')

# 5. Define models
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Train, predict & evaluate
results = []
pipelines = {}
for name, model in models.items():
    pipe = Pipeline([
        ('preproc', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)
    results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2})
    pipelines[name] = pipe

# 8. Display performance
import ace_tools as tools; tools.display_dataframe_to_user(name="Regression Model Comparison", dataframe=pd.DataFrame(results))

# 9. Plot Actual vs Predicted
for name, pipe in pipelines.items():
    preds = pipe.predict(X_test)
    plt.figure(figsize=(6,6))
    plt.scatter(y_test, preds, alpha=0.3)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual Total Sales')
    plt.ylabel('Predicted Total Sales')
    plt.title(f'Actual vs Predicted: {name}')
    plt.tight_layout()
    plt.show()


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values