In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [70]:
data = """ Impact of Infrastructure on Agricultural Productivity
Problem Statement: How does the availability and accessibility of agricultural infrastructure (e.g., irrigation facilities, soil testing centers, and fertilizer shops) influence agricultural productivity in rural villages? Possible Analysis:
•	Correlation analysis between the availability of agricultural infrastructure and net sown area.
•	Regression analysis to predict agricultural productivity based on infrastructure availability.
•	Comparative study of villages with high and low agricultural productivity to identify key infrastructural differences.

Possible Analysis:
1.	Correlation Analysis:
o	Investigate the relationship between the availability of agricultural infrastructure (such as irrigation facilities, soil testing centers, and fertilizer shops) and the net sown area.
o	Identify which infrastructure elements are most strongly associated with increased agricultural productivity.
2.	Regression Analysis:
o	Develop regression models to predict agricultural productivity based on the availability and accessibility of various agricultural infrastructure elements.
o	Include other variables such as demographic factors and geographic location to control for external influences.
3.	Comparative Study:
o	Conduct a comparative analysis of villages with high and low agricultural productivity.
o	Identify key infrastructural differences between these villages to determine which types of infrastructure investments are most effective in enhancing productivity.
o	Use qualitative and quantitative methods to provide a comprehensive understanding of the impact of infrastructure on agricultural productivity.
Core Agricultural Infrastructure Features:
1.	is_govt_seed_centre_available - Availability of government seed centers ensures the supply of quality seeds.
2.	availability_of_watershed_dev_project - Watershed development projects help in efficient water management.
3.	availability_of_rain_harvest_system - Rainwater harvesting systems improve water availability for agriculture.
4.	availability_of_fpos_pacs - Farmer Producer Organizations and Primary Agricultural Credit Societies support farmers in various agricultural activities.
5.	availability_of_food_storage_warehouse - Food storage warehouses help in reducing post-harvest losses.
6.	availability_of_farm_gate_processing - On-site processing facilities increase the value of agricultural produce.
7.	availability_of_custom_hiring_centre_agri_equipment - Access to agricultural equipment can enhance farming efficiency.
8.	is_soil_testing_centre_available - Soil testing centers provide crucial information for optimized fertilizer use.
9.	is_fertilizer_shop_available - Availability of fertilizer shops ensures timely supply of fertilizers.
10.	availability_of_major_source_of_irrigation - Major sources of irrigation are critical for consistent water supply to crops.
11.	no_of_farmers_using_drip_sprinkler - Adoption of efficient irrigation methods like drip and sprinkler systems.
12.	area_irrigated_in_hac - The extent of irrigated area directly impacts agricultural productivity.
Agricultural Productivity Features:
13.	total_cultivable_area_in_hac - The total area available for cultivation.
14.	net_sown_area_in_hac - The actual area sown with crops, indicating land use efficiency.
Key Observations:
is_fertilizer_shop_available (Correlation: 0.183):
The availability of a fertilizer shop has the highest positive correlation with net_sown_area_in_hac among all the features, though it is still relatively weak. This suggests that villages with fertilizer shops might tend to have a slightly larger net sown area. Fertilizer availability could contribute to better crop yields, potentially encouraging farmers to cultivate more land.
is_govt_seed_centre_available (Correlation: 0.173):
There is a modest positive correlation between the presence of a government seed center and the net sown area. Access to quality seeds might promote more extensive agricultural practices, leading to a larger sown area.
availability_of_food_storage_warehouse (Correlation: 0.161):
The presence of food storage warehouses is positively correlated with the net sown area. This could indicate that in villages where more land is cultivated, there’s a need for more substantial storage facilities to manage the produce.
availability_of_fpos_pacs (Correlation: 0.128):
The availability of Farmer Producer Organizations (FPOs) or Primary Agricultural Credit Societies (PACS) also shows a positive correlation with the net sown area. This might suggest that villages with better organizational support for farmers tend to have more land under cultivation.
Irrigation Sources (Canal: 0.107, Ground Water: 0.024, Surface Water: -0.035):
o	The correlations between different irrigation sources and the net sown area are generally weak. However, a slight positive correlation with canal irrigation suggests that access to canal water might contribute to a larger cultivated area. The other irrigation sources show negligible or weak correlations.
6.	availability_of_watershed_dev_project (Correlation: 0.090):
o	The correlation is positive but weak, indicating that villages with watershed development projects might have a slightly larger net sown area, likely due to improved water management.
Conclusion:
Overall, while the correlations between net_sown_area_in_hac and other features are positive, they are relatively weak. The most influential features appear to be the availability of fertilizer shops, government seed centers, and food storage warehouses. These facilities may support more extensive agricultural practices, leading to a larger net sown area.
However, no single feature exhibits a strong correlation with the net sown area, indicating that the extent of land under cultivation is likely influenced by a combination of factors rather than a single dominant feature. This suggests that improving multiple aspects of agricultural infrastructure, rather than focusing on just one, could be more effective in increasing the net sown area in villages.
"""

In [71]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [72]:
tokenizer = Tokenizer()

In [73]:
tokenizer.fit_on_texts([data])

In [74]:
tokenizer.word_index

{'of': 1,
 'the': 2,
 'agricultural': 3,
 'availability': 4,
 'area': 5,
 'to': 6,
 'and': 7,
 'sown': 8,
 'a': 9,
 'net': 10,
 'in': 11,
 'correlation': 12,
 'with': 13,
 'infrastructure': 14,
 'productivity': 15,
 'is': 16,
 'fertilizer': 17,
 'irrigation': 18,
 'villages': 19,
 'o': 20,
 '0': 21,
 'analysis': 22,
 'for': 23,
 'water': 24,
 'that': 25,
 'more': 26,
 'on': 27,
 'between': 28,
 'are': 29,
 'available': 30,
 'storage': 31,
 'positive': 32,
 'facilities': 33,
 'centers': 34,
 'shops': 35,
 'features': 36,
 'seed': 37,
 'food': 38,
 'hac': 39,
 'land': 40,
 'weak': 41,
 'might': 42,
 'larger': 43,
 'soil': 44,
 'testing': 45,
 'centre': 46,
 'watershed': 47,
 'farmers': 48,
 'sources': 49,
 'this': 50,
 '•': 51,
 'regression': 52,
 'comparative': 53,
 'identify': 54,
 'key': 55,
 'most': 56,
 'other': 57,
 'use': 58,
 'government': 59,
 'supply': 60,
 'fpos': 61,
 'pacs': 62,
 'support': 63,
 'warehouses': 64,
 'access': 65,
 'shop': 66,
 'cultivation': 67,
 'indicating':

In [75]:
len(tokenizer.word_index)

294

In [76]:
input_sequences=[]
for sentence in data.split('\n'):
#     print(tokenizer.texts_to_sequences([sentence])[0])
      tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]
      for i in range(1,len(tokenized_sentence)):
            input_sequences.append(tokenized_sentence[:i+1])
    

In [77]:
# input_sequences

In [78]:
max_len = max([len(x) for x in input_sequences])

In [81]:
max_len

62

In [82]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences,maxlen=max_len,padding = 'pre')

In [83]:
padded_input_sequences

array([[  0,   0,   0, ...,   0,  74,   1],
       [  0,   0,   0, ...,  74,   1,  14],
       [  0,   0,   0, ...,   1,  14,  27],
       ...,
       [  0,   0, 142, ...,  10,   8,   5],
       [  0, 142, 124, ...,   8,   5,  11],
       [142, 124, 145, ...,   5,  11,  19]], dtype=int32)

In [84]:
X = padded_input_sequences[:,:-1]

In [85]:
y= padded_input_sequences[:,-1]

In [86]:
X.shape

(805, 61)

In [87]:
y.shape

(805,)

In [89]:
from tensorflow.keras.utils import to_categorical
y= to_categorical(y,num_classes=295)

In [90]:
y.shape

(805, 295)

In [91]:
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [92]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [97]:
model = Sequential()
model.add(Embedding(294, 100))
model.add(LSTM(150))
model.add(LSTM(150))
model.add(Dense(294, activation='softmax'))

model.summary()


In [98]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [99]:
model.fit(X,y,epochs=10)

Epoch 1/10


ValueError: Input 0 of layer "lstm_13" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 150)

In [None]:
import time
text = "mail"
# tokenize
for i in range(5):
    token_text = tokenizer.texts_to_sequences([text])[0]
# padding
    padded_token_text = pad_sequences([token_text], maxlen =63, padding= 'pre')
    print(padding_token_text)
#predict
    np.argmax(model.predict(padded_token_text))
    for word,index in tokenizer.word_index.items():
        if index == pos:
            text =text + " "+ word
            print(text)
            time.sleep(2)

In [None]:
tokenizer.word_index