In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

# Set the style for academic publication.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the legacy names, so use whichever spelling this install
# actually ships instead of hard-coding the old one (which raises on 3.8+).
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither style sheet is available: fall back to seaborn's own theme.
    sns.set_style('whitegrid')
sns.set_context("paper", font_scale=1.3)
plt.rcParams['figure.figsize'] = (10, 6)  # default for figures created without figsize
plt.rcParams['font.family'] = 'serif'

# Sample counts per malware family, kept in the order they were supplied
# (note: this order is not strictly descending by count).
family_counts = {
    "sfone": 4622,
    "upatre": 3658,
    "wacatac": 3519,
    "wabot": 3454,
    "small": 3290,
    "mira": 1935,
    "berbew": 1710,
    "dinwod": 1586,
    "sillyp2p": 1604,
    "ceeinject": 1126,
    "musecador": 1027,
    "gandcrab": 952,
    "autoit": 912,
    "drolnux": 919,
    "smokeloader": 862,
    "unruy": 850,
    "gepys": 802,
    "qukart": 807,
    "ganelp": 751,
    "padodor": 700,
}

# Two-column table with one row per family, in the dict's insertion order.
df = pd.DataFrame({
    'Family': list(family_counts.keys()),
    'Count': list(family_counts.values()),
})

# Figure 1: Distribution of top malware families
# Pick the 15 largest families by count instead of the first 15 rows: df keeps
# the insertion order of family_counts, which is not strictly descending
# (e.g. dinwod precedes sillyp2p), so head(15) yields out-of-order bars and
# would pick the wrong families if the input order ever changed.
top_families = df.nlargest(15, 'Count')

plt.figure(figsize=(12, 6))
sns.barplot(data=top_families, x='Family', y='Count', color='darkblue', alpha=0.8)
plt.xticks(rotation=45, ha='right')  # slant labels so family names do not overlap
plt.xlabel('Malware Family')
plt.ylabel('Number of Samples')
plt.title('Distribution of Top 15 Malware Families')
plt.tight_layout()
plt.savefig('malware_distribution.pdf', dpi=300, bbox_inches='tight')
plt.close()  # release the figure so later plots start from a clean canvas

# Figure 2: Sample size distribution (log scale)
# log_scale=True makes seaborn compute the 30 bins in log space and set the
# x axis to log. Binning linearly and switching the axis afterwards (as was
# done before) produces bars of unequal visual width that misrepresent the
# distribution.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Count', bins=30, color='darkblue', alpha=0.8,
             log_scale=True)
plt.xlabel('Number of Samples (log scale)')
plt.ylabel('Number of Families')
plt.title('Distribution of Malware Family Sizes')
plt.tight_layout()
plt.savefig('malware_size_distribution.pdf', dpi=300, bbox_inches='tight')
plt.close()

# Figure 3: Cumulative distribution
# Order families from largest to smallest and record, for each rank, the
# running share (in percent) of all samples covered up to that family.
df_sorted = df.sort_values('Count', ascending=False)
sample_counts = df_sorted['Count']
df_sorted['Cumulative_Percentage'] = sample_counts.cumsum() / sample_counts.sum() * 100

# x axis is the 1-based rank of each family in the sorted table.
family_ranks = range(1, len(df_sorted) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    family_ranks,
    df_sorted['Cumulative_Percentage'],
    color='darkblue',
    linewidth=2,
    marker='o',
    markersize=4,
)
ax.set_xlabel('Number of Families')
ax.set_ylabel('Cumulative Percentage of Samples')
ax.set_title('Cumulative Distribution of Malware Samples')
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig('malware_cumulative.pdf', dpi=300, bbox_inches='tight')
plt.close(fig)

# Figure 4: Box plot with violin plot overlay
# Every family contributes exactly one Count value, so grouping by family
# (x='Family') collapses each violin and box to a flat line and conveys no
# distribution at all. Instead, show how the sample counts are distributed
# across the 15 largest families as a single violin with a box overlay.
top_families = df.nlargest(15, 'Count')

plt.figure(figsize=(12, 6))
sns.violinplot(data=top_families, y='Count', color='lightblue', alpha=0.5)
sns.boxplot(data=top_families, y='Count', width=0.2,
            color='white', fliersize=0, showmeans=True,
            meanprops={"marker": "s", "markerfacecolor": "white",
                       "markeredgecolor": "darkblue"})
plt.xticks([])  # a single group: there are no per-family ticks to label
plt.xlabel('Top 15 Malware Families')
plt.ylabel('Number of Samples')
plt.title('Distribution Characteristics of Top 15 Malware Families')
plt.tight_layout()
plt.savefig('malware_violin.pdf', dpi=300, bbox_inches='tight')
plt.close()

# LaTeX-compatible table of statistics
# describe() yields the summary statistics of df; every float in the table
# is rendered with two decimal places.
stats_df = df.describe()
two_decimals = '{:.2f}'.format
with open('malware_stats.tex', 'w') as stats_file:
    stats_file.write(stats_df.to_latex(float_format=two_decimals))

# Optional: Generate TikZ code for the cumulative distribution
# Everything up to and including the opening of the coordinate list; the
# trailing empty entry makes the joined text end with a newline.
tikz_header = '\n'.join([
    r'\begin{tikzpicture}',
    r'\begin{axis}[',
    r'  xlabel={Number of Families},',
    r'  ylabel={Cumulative Percentage of Samples},',
    r'  grid=major,',
    r'  width=12cm,',
    r'  height=8cm',
    r']',
    r'\addplot[thick,blue,mark=*] coordinates {',
    '',
])
# Closes the coordinate list, the axis and the picture (no final newline).
tikz_footer = '\n'.join([
    r'};',
    r'\end{axis}',
    r'\end{tikzpicture}',
])

with open('malware_tikz.tex', 'w') as tikz_file:
    tikz_file.write(tikz_header)
    # One "(rank,percentage)" pair per family, 1-based rank, written
    # back-to-back on a single line.
    tikz_file.writelines(
        f'({rank},{percentage:.2f})'
        for rank, percentage in enumerate(df_sorted['Cumulative_Percentage'], start=1)
    )
    tikz_file.write(tikz_footer)

# Report the artefacts written by this script, one line per output file.
generated_outputs = (
    ("malware_distribution.pdf", "Bar plot of top 15 families"),
    ("malware_size_distribution.pdf", "Log-scale distribution of family sizes"),
    ("malware_cumulative.pdf", "Cumulative distribution plot"),
    ("malware_violin.pdf", "Combined violin and box plot"),
    ("malware_stats.tex", "LaTeX table of summary statistics"),
    ("malware_tikz.tex", "TikZ code for cumulative distribution"),
)

print("Generated plots and files:")
for output_name, description in generated_outputs:
    print(f"- {output_name}: {description}")