In [5]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [6]:


# Relative path (resolved against the notebook's working directory) of the
# raw CSV that the next cell loads with pandas.
file_path = 'training.1600000.processed.noemoticon.csv'




In [7]:
# Read the CSV decoded as ISO-8859-1. header=None makes pandas treat every row
# as data and number the columns 0, 1, 2, ... instead of taking names from the
# first row.
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
# Column 0 holds the raw class label; show how many rows carry each value
# before the labels are remapped.
print("Initial target distribution (before mapping):")
print(data[0].value_counts())

Initial target distribution (before mapping):
0
0    800000
4    800000
Name: count, dtype: int64


In [8]:
# Fetch the NLTK data packages this notebook asks for. nltk.download() is a
# no-op (apart from a log line) when a package is already up to date.
for resource in (
    'stopwords',       # read through nltk.corpus.stopwords during text cleaning
    'vader_lexicon',   # not referenced in the visible cells; kept from the original list
    'wordnet',         # presumably backs WordNetLemmatizer -- confirm if trimming this list
    'omw-1.4',
    'punkt',
    # NLTK 3.8.2+ loads the tokenizer model behind word_tokenize() from
    # 'punkt_tab' rather than the pickled 'punkt' data, so fetching only
    # 'punkt' leaves word_tokenize() raising LookupError on current releases.
    'punkt_tab',
):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Replace the positional column labels with readable names. The assignment
# below mutates `data` in place.
col_names = ['target','id','date','flag','user','text']
data.columns = col_names
# Preview the first rows so the new names can be checked against the values.
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
# Collapse the raw labels to binary: 0 stays 0 and 4 becomes 1.
# The extra 1 -> 1 entry keeps this cell idempotent: without it, running the
# cell a second time would send the already-converted 1s to NaN, because
# Series.map() yields NaN for any value missing from the mapping.
data['target'] = data['target'].map({0: 0, 4: 1, 1: 1})

In [11]:
# Show how many rows carry each value of the `target` column.
print(data['target'].value_counts())

target
0    800000
1    800000
Name: count, dtype: int64


In [12]:
# Single Porter stemmer instance shared by the text-cleaning code. The
# (misspelled) name is referenced from other cells, so it is kept as-is.
stremmer = PorterStemmer()

# Negation and near-negation cue words. The text-cleaning function checks
# each token against this set in addition to the NLTK stop-word list.
negation_words = {
    'no', 'not', 'never', 'none',
    'nobody', 'nothing', 'neither', 'nowhere',
    'hardly', 'scarcely', 'barely',
}

# English stop words, loaded on first use and reused afterwards. The previous
# version called stopwords.words('english') inside the comprehension, which
# rebuilt the list and scanned it linearly for every single token.
_ENGLISH_STOPWORDS = None


def stemming(content):
    """Normalise one raw text string into space-separated Porter stems.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lowercase, split on whitespace, drop tokens found in the NLTK
    English stop-word list or in ``negation_words``, stem the survivors with
    ``stremmer``, and join them with single spaces.

    Args:
        content: Raw text (str) to clean.

    Returns:
        str: The cleaned, stemmed text; an empty string when no token survives.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # frozenset gives O(1) membership tests and guards against mutation.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    # NOTE(review): tokens in `negation_words` are *removed* here, exactly as
    # in the original filter. For sentiment work, dropping "never"/"nothing"
    # may discard polarity cues -- confirm this is intended rather than an
    # inverted condition.
    stems = [
        stremmer.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS and token not in negation_words
    ]
    return ' '.join(stems)

In [13]:
# Clean every row's text with `stemming`, overwriting the `text` column in
# place. NOTE: the raw text is no longer available in `data` after this cell,
# and re-running the cell feeds already-cleaned text through `stemming` again.
data['text'] = data['text'].apply(stemming)

In [14]:
# Re-check the label balance after text cleaning.
# NOTE(review): the message says "after sampling", but no sampling step is
# visible before this cell -- confirm whether the wording is stale.
print("\nClass distribution after sampling:")
print(data['target'].value_counts())


Class distribution after sampling:
target
0    800000
1    800000
Name: count, dtype: int64


In [15]:
# Pull the features (cleaned text) and labels out of the frame as NumPy arrays.
X = data['text'].values
y = data['target'].values

In [16]:
# Hold out 20% of the rows for evaluation; random_state=2 pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

In [17]:
# TF-IDF features over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Include n-grams
# Fit the vectorizer on the training split only, then reuse it to transform
# the test split, so the test text does not influence the fitted vocabulary.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [18]:
# Sanity check: report the container type and (rows, features) shape of the
# training matrix.
print(type(X_train_vectors))
print(X_train_vectors.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(1280000, 5000)


In [19]:
# Logistic-regression classifier with scikit-learn's default hyperparameters,
# trained on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(X_train_vectors,y_train)

In [20]:
# Score the held-out split: predict labels, then print plain accuracy.
y_pred = model.predict(X_test_vectors)
print(accuracy_score(y_test,y_pred))

0.76831875


In [21]:
def predict_sentiment(text):
    """Classify one raw text string as "Negative" or "Positive".

    The text goes through ``stemming`` -- the same cleaning function applied
    to the training data -- so inference sees tokens prepared exactly as the
    fitted ``vectorizer`` and ``model`` saw them. The previous inline copy of
    the cleaning code skipped the ``negation_words`` filter, so training and
    prediction were preprocessed differently.

    Args:
        text: Raw, uncleaned text (str).

    Returns:
        str: "Negative" when the model predicts label 0, otherwise "Positive".
    """
    cleaned = stemming(text)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([cleaned])
    # predict() returns one label per document; take the single label rather
    # than comparing the whole array to 0.
    label = model.predict(features)[0]
    return "Negative" if label == 0 else "Positive"

In [22]:
# Shared WordNet lemmatizer instance used by `lemmatize_text`.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by spaces.

    Args:
        text: Input string, split into tokens with ``word_tokenize``.

    Returns:
        str: Each token passed through ``lemmatizer.lemmatize`` (default
        arguments), re-joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))
