In [16]:
# Loading dataset
import pandas as pd
from pandas import read_csv
import numpy as np
# Path of the raw sales CSV, read relative to the notebook's working directory
filename = 'sales.csv'
# Parse the CSV into a DataFrame and preview its first rows
data = pd.read_csv(filename)
data.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,,2,500,300
1,,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [17]:
# Count the missing entries in every column
data.isna().sum()

rate                     2
sales_in_first_month     0
sales_in_second_month    0
sales_in_third_month     0
dtype: int64

In [18]:
# Mean of the numeric columns only.
# 'rate' still holds words (e.g. 'four') at this point, so it has to be excluded
# explicitly: pandas >= 2.0 raises TypeError instead of silently dropping
# non-numeric columns when numeric_only is left at its default.
data.mean(numeric_only=True)

sales_in_first_month     367.666667
sales_in_second_month    295.000000
sales_in_third_month     508.333333
dtype: float64

In [19]:
# Filling missing values
# Assign the filled columns back instead of calling fillna(..., inplace=True) on
# data[col]: that selection can be a temporary object, in which case the in-place
# fill never reaches `data` (always the case under pandas Copy-on-Write).
first_month_mean = data['sales_in_first_month'].mean()
data = data.assign(
    rate=data['rate'].fillna(0),  # a missing rate is encoded as 0
    sales_in_first_month=data['sales_in_first_month'].fillna(first_month_mean),
)
data.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [20]:
# Feature selection
# The first three columns (rate and the first two months of sales) are the features.
# .copy() makes X an independent frame, so assigning to its columns later neither
# raises SettingWithCopyWarning nor depends on view/copy semantics of the slice.
X = data.iloc[:, :3].copy()
# The last column (sales_in_third_month) is the prediction target.
y = data.iloc[:, -1]

In [21]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,four,600,200
3,nine,450,320
4,seven,600,250


In [22]:
# Preview the first rows of the target series
y.head()

0    300
1    650
2    400
3    650
4    350
Name: sales_in_third_month, dtype: int64

In [23]:
# Convert words to numbers
# Lookup table from a spelled-out number to its integer value. Built once at
# definition time instead of on every call. The integer key 0 lets the
# placeholder used for missing rates pass through unchanged.
_WORD_TO_INT = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
    0: 0,
}


def convert_to_int(word):
    """Convert a spelled-out number ('zero' .. 'twelve') to its integer value.

    Parameters
    ----------
    word : str or int
        An English number word, matched case-insensitively and ignoring
        surrounding whitespace, or the integer 0.

    Returns
    -------
    int
        The numeric value of ``word``.

    Raises
    ------
    KeyError
        If ``word`` is not one of the supported words (or 0).
    """
    if isinstance(word, str):
        # Tolerate inputs such as ' Seven ' or 'NINE' coming from raw CSV text.
        word = word.strip().lower()
    return _WORD_TO_INT[word]

In [24]:
# Build the numeric 'rate' column from the untouched source column in `data` and
# bind a new frame to X. Re-running this cell therefore always converts the
# original words rather than already-converted integers, and no column is
# assigned into a slice of `data`. Index alignment keeps rows matched to X.
X = X.assign(rate=data['rate'].map(convert_to_int))

In [25]:
# Preview X again: 'rate' should now show integers instead of words
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,4,600,200
3,9,450,320
4,7,600,250


In [29]:
# Concatenate the features and the target column-wise into a single frame
df = pd.concat(objs=[X, y], axis='columns')
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,4,600,200,400
3,9,450,320,650
4,7,600,250,350


In [33]:
# Inspect the column dtypes of the combined frame
df.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
dtype: object

In [35]:
# Saving clean dataset
# NOTE(review): to_csv writes the row index as an unnamed first column by default.
# If readers of clean_sales.csv expect only the four data columns, pass
# index=False here — confirm with the consumers of this file before changing it.
df.to_csv('clean_sales.csv')

In [36]:
# Fitting the model
from sklearn.linear_model import LinearRegression
# fit() returns the fitted estimator, so construction and training are chained
clf = LinearRegression().fit(X, y)
# R^2 computed on the same rows used for fitting (not a held-out estimate)
clf.score(X, y)

0.6948637514051954

In [37]:
# Saving a model
import pickle
# Serialize the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle dangling.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [39]:
# Making prediction
# Reload the model written above. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# The model was fitted on a DataFrame, so pass the sample with the same column
# names and order; a bare list makes scikit-learn warn about missing feature names.
sample = pd.DataFrame(
    [[4, 300, 500]],
    columns=['rate', 'sales_in_first_month', 'sales_in_second_month'],
)
print(model.predict(sample))

[143.3072588]
