In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
mail_data = pd.read_csv('mail_data.csv')

In [3]:
mail_data.shape

(5572, 2)

In [4]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
mail_data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [9]:
#replace the null value with null string
mail_data=mail_data.where((pd.notnull(mail_data)),'')

In [10]:
mail_data.loc[mail_data['Category']=='spam','Category',]=0
mail_data.loc[mail_data['Category']=='ham','Category',]=1
#mail_data.replace({'Category':{'spam':0,'ham':1}},inplace=True)

In [11]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
data = mail_data['Message']
#data = mail_data.drop(['Category'],axis = 1)
label = mail_data['Category']

In [13]:
data.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(data,label,test_size=.2,random_state=3)

In [15]:
print(data.shape,X_train.shape,X_test.shape)

(5572,) (4457,) (1115,)


In [16]:
#transform the text data to features vectors that csn be used as an input to the logistic regression
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase='True')

In [17]:
X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.fit_transform(X_test)

In [18]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [19]:
print( X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [20]:
model = LogisticRegression()

In [21]:
model.fit(X_train_features,Y_train)

In [22]:
X_train_predict = model.predict(X_train_features)

X_train_accuracy = accuracy_score(Y_train,X_train_predict)
print(X_train_accuracy)

0.9670181736594121


In [23]:
X_test_predict = model.predict(X_test_features)

X_test_accuracy = accuracy_score(Y_test,X_test_predict)

print(X_test_accuracy)

In [24]:
input_mail = ['Ok lar... Joking wif u oni...']

input_data_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_data_features)
#print(prediction)

if prediction == 0:
    print('Ham Mail')
else:
    print('Spam Mail')

ValueError: X has 3296 features, but LogisticRegression is expecting 7431 features as input.