In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

1. Find the smallest sale amount recorded for each item.

In [2]:
# Load the sales workbook; the cells that follow reuse this frame.
sales_df = pd.read_excel('SalesData.xlsx')

# Smallest Sale_amt per Item: name the aggregated Series first, then turn
# the group key back into an ordinary column.
min_sales_by_item = (
    sales_df
    .groupby('Item')['Sale_amt']
    .min()
    .rename('Min_Sale_Amount')
    .reset_index()
)

print("Least amount sale for each item:")
print(min_sales_by_item)

Least amount sale for each item:
           Item  Min_Sale_Amount
0    Cell Phone           3375.0
1          Desk            250.0
2  Home Theater           2000.0
3    Television           8386.0
4   Video Games            936.0


2. Compute the total sales for each year and region across all items

In [3]:
# Derive the calendar year of every order and keep it as a 'Year' column.
order_dates = pd.to_datetime(sales_df['OrderDate'])
sales_df['Year'] = order_dates.dt.year

# Sum Sale_amt within each (Year, Region) pair, across all items.
amounts_by_year_region = sales_df.groupby(['Year', 'Region'])['Sale_amt']
total_sales_by_year_region = amounts_by_year_region.sum().reset_index()

print("Total sales for each year and region:")
print(total_sales_by_year_region)

Total sales for each year and region:
   Year   Region  Sale_amt
0  2018  Central  479825.0
1  2018     East  293780.0
2  2018     West  105424.0
3  2019  Central  349944.5
4  2019     East   27227.0
5  2019     West   49475.0


3. Create a new column 'days_diff' holding the number of days between a given reference date and each order date

In [4]:
def add_days_diff(df, reference_date):
    """Return a copy of ``df`` with a ``days_diff`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'OrderDate' column that ``pd.to_datetime`` can parse.
    reference_date : str, datetime or pandas.Timestamp
        Date the order dates are measured against.

    Returns
    -------
    pandas.DataFrame
        New frame in which 'OrderDate' is converted to datetime and
        'days_diff' holds ``reference_date - OrderDate`` in whole days.
        The value is negative for orders placed after the reference date.
    """
    order_dates = pd.to_datetime(df['OrderDate'])
    reference = pd.to_datetime(reference_date)
    return df.assign(
        OrderDate=order_dates,
        days_diff=(reference - order_dates).dt.days,
    )


# The reference date is a parameter of the helper; change it here to
# recompute the column against another date.
reference_date = pd.to_datetime('2018-12-31')

sales_df = add_days_diff(sales_df, reference_date)

print("DataFrame with days_diff column:")
print(sales_df.head())

DataFrame with days_diff column:
   OrderDate   Region  Manager   SalesMan          Item  Units  Unit_price  \
0 2018-01-06     East   Martha  Alexander    Television     95      1198.0   
1 2018-01-23  Central  Hermann     Shelli  Home Theater     50       500.0   
2 2018-02-09  Central  Hermann       Luis    Television     36      1198.0   
3 2018-02-26  Central  Timothy      David    Cell Phone     27       225.0   
4 2018-03-15     West  Timothy    Stephen    Television     56      1198.0   

   Sale_amt  Year  days_diff  
0  113810.0  2018        359  
1   25000.0  2018        342  
2   43128.0  2018        325  
3    6075.0  2018        308  
4   67088.0  2018        291  


4. Create a dataframe with two columns: 'manager', 'list_of_salesmen'

In [5]:
# Distinct salesmen per manager. SeriesGroupBy.unique() yields one array per
# group, so each is converted to a plain Python list to match the
# 'list_of_salesmen' column name.
manager_salesmen = (
    sales_df
    .groupby('Manager')['SalesMan']
    .unique()
    .apply(list)
    .reset_index()
)
manager_salesmen.columns = ['manager', 'list_of_salesmen']

print("DataFrame with manager and list of salesmen:")
print(manager_salesmen)

DataFrame with manager and list of salesmen:
   manager            list_of_salesmen
0  Douglas      [Michael, Karen, John]
1  Hermann       [Shelli, Luis, Sigal]
2   Martha  [Alexander, Steven, Diana]
3  Timothy            [David, Stephen]


5. For each region, find the number of salesmen and the total sales

In [6]:
# One row per region: how many distinct salesmen sold there and the summed
# sale amount.
sales_by_region = sales_df.groupby('Region')
region_summary = sales_by_region.agg(
    salesmen_count=('SalesMan', 'nunique'),
    total_sales=('Sale_amt', 'sum'),
)
region_stats = region_summary.reset_index()

print("Region statistics:")
print(region_stats)

Region statistics:
    Region  salesmen_count  total_sales
0  Central               6     829769.5
1     East               3     321007.0
2     West               2     154899.0


6. Create a dataframe showing each manager's total sales as a percentage of overall sales

In [7]:
# Denominator: sales summed over the whole frame.
total_sales = sales_df['Sale_amt'].sum()

# Numerator: sales summed per manager.
manager_sales = sales_df.groupby('Manager')['Sale_amt'].sum()

# Share of the overall total, expressed in percent.
sales_share = manager_sales / total_sales * 100
manager_percent = sales_share.reset_index()
manager_percent.columns = ['manager', 'percent_sales']

print("Manager sales percentage:")
print(manager_percent)

Manager sales percentage:
   manager  percent_sales
0  Douglas      18.308990
1  Hermann      27.963188
2   Martha      36.187629
3  Timothy      17.540193


7. Get the IMDb rating for the fifth movie in the dataframe

In [8]:
# Rows that fail to parse are skipped without an error, so positions below
# refer to the rows that survived loading.
imdb_df = pd.read_csv('imdb.csv', on_bad_lines='skip')

# Positions are zero-based: the fifth movie sits at position 4.
fifth_movie = imdb_df.iloc[4]
fifth_movie_rating = fifth_movie['imdbRating']
print(f"IMDB rating for fifth movie: {fifth_movie_rating}")

IMDB rating for fifth movie: 8.7


8. Return titles of movies with shortest and longest run time

In [9]:
# Locate the index labels of the smallest and largest duration, then pull
# the full rows so both the title and the duration can be reported.
durations = imdb_df['duration']
shortest_label = durations.idxmin()
longest_label = durations.idxmax()

shortest_movie = imdb_df.loc[shortest_label]
longest_movie = imdb_df.loc[longest_label]

print(f"Shortest movie: {shortest_movie['title']} (Duration: {shortest_movie['duration']} seconds)")
print(f"Longest movie: {longest_movie['title']} (Duration: {longest_movie['duration']} seconds)")

Shortest movie: Traffic Crossing Leeds Bridge (1888) (Duration: 2.0 seconds)
Longest movie: Baseball The National Pastime (TV Episode 1994) (Duration: 68400.0 seconds)


9. Sort by release date (earliest) and IMDB rating (highest to lowest)

In [10]:
# Earliest year first; within a year, highest rating first.
sort_columns = ['year', 'imdbRating']
sort_ascending = [True, False]
sorted_imdb = imdb_df.sort_values(by=sort_columns, ascending=sort_ascending)

print("Sorted by release date and rating:")
preview_columns = ['title', 'year', 'imdbRating']
print(sorted_imdb[preview_columns].head())

Sorted by release date and rating:
                                        title    year  imdbRating
13605            Roundhay Garden Scene (1888)  1888.0         7.8
13282    Traffic Crossing Leeds Bridge (1888)  1888.0         7.2
6705                  Blacksmith Scene (1893)  1893.0         6.3
12316  Dickson Experimental Sound Film (1894)  1894.0         6.8
6706            The Kiss in the Tunnel (1899)  1899.0         5.9


10. Subset with movies having duration between 30 and 180 minutes

In [11]:
# The cutoffs are given in minutes, so add a minutes column derived from
# 'duration' (divided by 60).
imdb_df['duration_minutes'] = imdb_df['duration'] / 60

# Keep rows whose length lies in [30, 180] minutes; both ends are included.
within_range = imdb_df['duration_minutes'].between(30, 180)
filtered_movies = imdb_df[within_range]

print(f"Number of movies with duration between 30 and 180 minutes: {len(filtered_movies)}")
print(filtered_movies[['title', 'duration_minutes']].head())

Number of movies with duration between 30 and 180 minutes: 11952
                              title  duration_minutes
0  Der Vagabund und das Kind (1921)              54.0
1                 Goldrausch (1925)              95.0
2                 Metropolis (1927)             153.0
3                Der General (1926)             107.0
4      Lichter der Großstadt (1931)              87.0


11. Count the duplicate rows of diamonds DataFrame

In [12]:
# Load the diamonds data; the remaining cells work on this frame.
diamonds_df = pd.read_csv('diamonds.csv')

# A row is flagged when it repeats an earlier row in every column; the
# first occurrence itself is not flagged.
repeated_rows = diamonds_df.duplicated()
duplicate_count = repeated_rows.sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 149


12. Drop rows with missing values in the carat or cut columns

In [13]:
# A row is removed when either required column is missing; diamonds_df
# itself is left unchanged.
required_columns = ['carat', 'cut']
cleaned_diamonds = diamonds_df.dropna(subset=required_columns)

print(f"Original row count: {len(diamonds_df)}")
print(f"Row count after dropping missing values: {len(cleaned_diamonds)}")

Original row count: 53943
Row count after dropping missing values: 53941


13. Subset the dataframe with only numeric columns

In [14]:
# Keep only the columns pandas stores with a numeric dtype.
# NOTE(review): the recorded output has no 'carat' column, which suggests it
# was read with a non-numeric dtype — confirm whether it needs cleaning.
numeric_diamonds = diamonds_df.select_dtypes(include='number')

print("DataFrame with only numeric columns:")
print(numeric_diamonds.head())
print(f"Numeric columns: {numeric_diamonds.columns.tolist()}")

DataFrame with only numeric columns:
   depth  table  price     x     y     z
0   61.5   55.0  326.0  3.95  3.98  2.43
1   59.8   61.0  326.0  3.89  3.84  2.31
2   56.9   65.0  327.0  4.05  4.07  2.31
3   62.4   58.0  334.0  4.20  4.23  2.63
4   63.3   58.0  335.0  4.34  4.35  2.75
Numeric columns: ['depth', 'table', 'price', 'x', 'y', 'z']


14. Compute volume as (x * y * z) when depth is greater than 60, otherwise default to 8

In [15]:
# Rows with depth above 60 get x * y * z; every other row gets the
# default of 8.
is_deep = diamonds_df['depth'] > 60
xyz_product = diamonds_df['x'] * diamonds_df['y'] * diamonds_df['z']
diamonds_df['volume'] = np.where(is_deep, xyz_product, 8)

print("DataFrame with volume column:")
print(diamonds_df[['carat', 'depth', 'x', 'y', 'z', 'volume']].head())

DataFrame with volume column:
  carat  depth     x     y     z    volume
0  0.23   61.5  3.95  3.98  2.43  38.20203
1  0.21   59.8  3.89  3.84  2.31   8.00000
2  0.23   56.9  4.05  4.07  2.31   8.00000
3  0.29   62.4  4.20  4.23  2.63  46.72458
4  0.31   63.3  4.34  4.35  2.75  51.91725


15. Impute missing price values with mean

In [16]:
# The mean is taken over the prices that are present (missing values are
# ignored), then used to fill the gaps.
price_column = diamonds_df['price']
mean_price = price_column.mean()

diamonds_df['price'] = price_column.fillna(mean_price)

print(f"Mean price used for imputation: {mean_price}")
print("DataFrame after imputing missing prices:")
print(diamonds_df[['carat', 'cut', 'price']].head())

Mean price used for imputation: 3932.8585253712527
DataFrame after imputing missing prices:
  carat      cut  price
0  0.23    Ideal  326.0
1  0.21  Premium  326.0
2  0.23     Good  327.0
3  0.29  Premium  334.0
4  0.31     Good  335.0
