# Loading, pre processing and saving

In [1]:
# Loading the dataset
import pandas as pd
import numpy as np

filename = 'sales.csv'
data = pd.read_csv(filename)
data.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,,2,500,300
1,,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [2]:
# Checking missing values
data.isnull().sum()

rate                     2
sales_in_first_month     0
sales_in_second_month    0
sales_in_third_month     0
dtype: int64

In [3]:
# Filling missing values
data['rate'].fillna(0, inplace=True)
data['sales_in_first_month'].fillna(data['sales_in_first_month'].mean(), inplace=True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sales_in_first_month'].fillna(data['sales_in_first_month'].mean(), inplace=True)


Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [4]:
# Feature selection
X = data.iloc[:, :3]
y = data.iloc[:, -1]

In [5]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,four,600,200
3,nine,450,320
4,seven,600,250


In [6]:
y.head()

0    300
1    650
2    400
3    650
4    350
Name: sales_in_third_month, dtype: int64

In [11]:
# Convert words to number

def convert_to_int(word):
    word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10, 'eleven':11, 'twelve':12, 'zero':0, 0:0}
    return word_dict[word]                 

In [12]:
X['rate'] = X['rate'].apply(lambda x : convert_to_int(x))
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,4,600,200
3,9,450,320
4,7,600,250


In [13]:
# Concatenate
df = pd.concat([X,y], axis = 1)
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,4,600,200,400
3,9,450,320,650
4,7,600,250,350


In [15]:
df.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
dtype: object

In [16]:
# Saving cleaned dataset
df.to_csv('clean_sales.csv')

# Linear Regression

In [17]:
# Fitting the model 
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X,y)
clf.score(X,y)

0.6948637514051953

In [18]:
# Saving the model
import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

In [20]:
# Making prediction
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[4, 300, 500]]))

[143.3072588]


