In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**First, install the chardet library**

In [None]:
!pip install chardet

**This Python code snippet reads multiple CSV files from a specified directory using the `glob` module. It iterates through each file, detects its encoding using the `chardet` library, and prints the detection result.
If the index of the file is not 2, it also reads the CSV file into a Pandas DataFrame using the detected encoding and prints the file name, the detected encoding, and the shape of the resulting DataFrame.**

In [None]:
import pandas as pd
from glob import glob
from chardet import detect
# Collect every CSV file that sits directly in the dataset directory (no recursion).
input_files = glob(pathname='/kaggle/input/world-war-2-archive/*.csv', recursive=False)
for index, input_file in enumerate(input_files):
    # Read the raw bytes inside a context manager so the file handle is closed
    # right away instead of staying open until garbage collection.
    with open(file=input_file, mode='rb') as raw_file:
        detect_result = detect(raw_file.read())
    print(detect_result)
    # NOTE(review): glob returns paths in arbitrary, filesystem-dependent order, so
    # "index 2" is not guaranteed to be the same file on every run -- consider
    # skipping by file name instead.
    if index != 2:
        # Load the file with the detected encoding only to report its shape.
        detected_encoding = detect_result['encoding']
        frame_shape = pd.read_csv(filepath_or_buffer=input_file, encoding=detected_encoding).shape
        print('file: {} encoding: {} shape: {}'.format(input_file, detected_encoding, frame_shape))

In [None]:
from numpy import nan
# Load the ships table, using the first CSV column as the DataFrame index.
ships_df = pd.read_csv(filepath_or_buffer='/kaggle/input/world-war-2-archive/ships.csv', index_col=[0])

# Derive a numeric 'year' column from the raw 'Launch Year' column.
# A value is kept only when, after trimming whitespace, it is exactly four digits
# and not the placeholder '0000'; everything else becomes NaN.
# The parse is vectorised so that missing cells, non-string cells, and four-character
# non-numeric text yield NaN instead of raising AttributeError / ValueError.
launch_year_text = ships_df['Launch Year'].astype(str).str.strip()
is_valid_year = launch_year_text.str.fullmatch(r'[0-9]{4}') & (launch_year_text != '0000')
ships_df['year'] = pd.to_numeric(launch_year_text.where(is_valid_year), errors='coerce')
ships_df.head()

**Columns in the dataset**

In [None]:
# Print a concise summary of ships_df: index range, column names, non-null counts and dtypes.
ships_df.info()

**The provided Python code utilizes Plotly Express to create a histogram from the DataFrame 'ships_df'. The histogram is plotted based on the 'year' column, with each bar colored according to the respective countries represented in the DataFrame. It sorts the DataFrame by the 'Country' column before plotting to ensure a consistent ordering in the color scheme.**


In [None]:
from plotly.express import histogram
# Sort by country first so the colour legend appears in a consistent order,
# then draw the per-year histogram with one colour per country.
ships_by_country = ships_df.sort_values(by='Country')
histogram(data_frame=ships_by_country, x='year', color='Country')

**This Python code snippet utilizes Plotly Express to create a bar chart. It aggregates data from the DataFrame 'ships_df' by grouping it based on the 'Class' and 'Country' columns, then calculates the size of each group. The result is then reset to form a new DataFrame. The 'Class' values are plotted on the y-axis, the count of occurrences on the x-axis, and each bar is colored according to the respective countries. The 'height' parameter is set to 5000 to adjust the height of the visualization.**


In [None]:
from plotly.express import bar
# Count ships per (Class, Country) pair and expose the count as a 'count' column.
class_country_counts = (
    ships_df[['Country', 'Class']]
    .groupby(by=['Class', 'Country'])
    .size()
    .reset_index(name='count')
)
# Horizontal bars: one row per class, coloured by country; the tall figure keeps
# the many class labels readable.
bar(data_frame=class_country_counts, y='Class', x='count', color='Country', height=5000)

**This Python code snippet utilizes Matplotlib to create a word cloud visualization. It imports functions from Matplotlib for subplots, axis manipulation, and image display. Additionally, it imports the WordCloud class from the wordcloud library. 
The word cloud is generated using the 'Class' column values from the 'ships_df' DataFrame, joined into a single string. The WordCloud object is configured with specific parameters like 'random_state', 'height', 'width', and a list of stopwords. 
The resulting word cloud is displayed with subplots configured for a size of 12x12 inches, and axis turned off for a cleaner visualization.**


In [None]:
from matplotlib.pyplot import subplots
from matplotlib.pyplot import axis
from matplotlib.pyplot import imshow
from wordcloud import WordCloud
# Join every ship class name into a single space-separated text for the word cloud.
class_text = ' '.join(ships_df['Class'].values)
# Fixed random_state keeps the layout reproducible; the word 'class' is excluded as a stopword.
class_cloud = WordCloud(random_state=2023, height=1200, width=1200, stopwords=['class']).generate(text=class_text)
# Render the cloud on a 12x12 inch figure with the axes hidden.
subplots(figsize=(12, 12))
imshow(X=class_cloud)
axis('off')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Histogram of Launch Year
# Plot the cleaned numeric 'year' column rather than the raw 'Launch Year' text
# column: binning and the KDE overlay need numeric data. NaN years are dropped.
# The figure gets its own single Axes instead of slot 1 of a 2x3 grid, because the
# inline backend renders and closes the figure at the end of the cell, so later
# cells cannot fill the remaining slots.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(ships_df['year'].dropna(), bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Launch Year')
ax.set_xlabel('Launch Year')
ax.set_ylabel('Count')
plt.show()

**Plot of the number of ships launched over the years**

In [None]:
# Number of ships launched in each year, ordered chronologically.
ship_launches_over_years = ships_df['year'].value_counts().sort_index()

# Draw on a dedicated figure: the previous cell's figure is no longer open in a new
# cell, so plt.subplot(2, 1, 2) would only fill the bottom half of a fresh
# default-sized figure.
fig, ax = plt.subplots(figsize=(12, 6))
ship_launches_over_years.plot(ax=ax)
ax.set_title('Ship Launches Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()


**Scatter plot of launch year and ship count**

In [None]:
# Count the ships launched per (cleaned, numeric) year so the y-axis really is a
# count. The previous version plotted the raw 'Launch Year' text against the parsed
# 'year' column while labelling the y-axis "Count".
ships_per_year = ships_df['year'].value_counts().sort_index()

# Dedicated figure: a figure from an earlier cell is not available to draw into here.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x=ships_per_year.index, y=ships_per_year.values, alpha=0.5, ax=ax)
ax.set_title('Launch Year vs. Ship Count')
ax.set_xlabel('Launch Year')
ax.set_ylabel('Count')
plt.show()

**Summary statistics**

In [None]:
# Descriptive statistics for the launch-year columns, printed under a heading.
launch_year_columns = ['Launch Year', 'year']
summary_stats = ships_df[launch_year_columns].describe()
print("Summary Statistics:", summary_stats, sep="\n")

**Country vs. class analysis**

In [None]:
# Contingency table: rows are countries, columns are ship classes, cells are ship counts.
country_class_cross_tab = pd.crosstab(index=ships_df['Country'], columns=ships_df['Class'])

# Annotated heatmap of the table on its own 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=country_class_cross_tab, cmap='Blues', annot=True, fmt='d', ax=ax)
ax.set_title('Country vs Class Analysis')
ax.set_xlabel('Class')
ax.set_ylabel('Country')
plt.show()

In [None]:
# Box plot of the launch-year columns, used to spot outlying years.
year_columns = ships_df[['Launch Year', 'year']]
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=year_columns, palette='Set2')
ax.set_title('Outlier Detection: Launch Year and Year')
ax.set_ylabel('Year')
plt.show()